Aim

35,500 reporters for 86 TFs were transfected into 9 different cell types + tested across ~100 perturbation conditions. In this script I will analyze TF reporter activities in detail and review how individual reporters respond to TF perturbations.

Description of input data

  • commercial_reporter = positive control reporter sequence from commercial source
  • neg_ctrls = reporter with mutated TF binding sites?
  • hPGK = positive control, chunk from hPGK promoter (only in lib 1)
  • native_enhancer = genomic mES enhancer sequence chunks from Miguels libraries (only lib 1)
  • reporter_activity_minP = mean activity relative to median of negative control reporters (per core promoter)
  • reporter_activity_neg = mean activity relative to its paired negative control (identical reporter with mutated binding sites)

Setup

Libraries

knitr::opts_chunk$set(echo = TRUE)
StartTime <-Sys.time()

# 8-digit Date tag:
Date <- substr(gsub("-","",Sys.time()),1,8) 
# libraries:
library(RColorBrewer)
library(ggplot2)
library(dplyr)
library(maditr)
library(tibble)
library(pheatmap)
library(ggpubr)
library(visNetwork)
library(ggbeeswarm)
library(ggforce)
library(viridis)
library(plyr)
library(igraph)
library(ggraph)
library(cowplot)
library(gridExtra)
library(pROC)
library(tidyr)
library(stringr)
library(randomForest)
library(ggrastr)
library(readr)
library(ggbiplot)
library(IHW)
library(biomaRt)
library(ggh4x)
library(ggiraph)
library(plotly)
library(AMR)
library(umap)
library(ggrepel)

Functions

### Custom functions
SetFileName <- function(filename, initials) {
  # Set filename with extension and initials to make filename with date integrated.
  filename <- substitute(filename)
  initials <- substitute(initials)
  filename <- paste0(initials, Date, filename)
  filename
}


# Extract p-value from linear model
lmp <- function (modelobject) {
    if (class(modelobject) != "lm") stop("Not an object of class 'lm' ")
    f <- summary(modelobject)$fstatistic
    p <- pf(f[1],f[2],f[3],lower.tail=F)
    attributes(p) <- NULL
    return(p)
}

# Set custom ggplot2 theme and custom colors
theme_classic_lines <- function() {
  theme_pubr(border = F, legend = "top") +
            theme(panel.grid.major = element_line(colour = "#adb5bd", linewidth = 0.25),
                  strip.background = element_rect(fill = "#ced4da")
            )
}

theme_classic_lines_45 <- function() {
  theme_pubr(border = T, legend = "top", x.text.angle = 45) +
            theme(panel.grid.major = element_line(colour = "#adb5bd", linewidth = 0.25),
                  strip.background = element_rect(fill = "#ced4da")
            )
}

theme_classic_lines_90 <- function() {
  theme_pubr(border = T, legend = "top", x.text.angle = 90) +
            theme(panel.grid.major = element_line(colour = "#adb5bd", linewidth = 0.25),
                  strip.background = element_rect(fill = "#ced4da")
            )
}

theme_barplot <- function() {
  theme_pubr(border = T, legend = "none") +
            theme(panel.grid.major.x = element_line(colour = "black", linewidth = 0.25),
                  strip.background = element_blank(),
                  strip.text = element_text(face="bold", hjust=0)
            )
}

theme_set(theme_classic_lines())

colors_diverse <- c("#264653", "#9AC1AE", "#5D987B", "#f2cc8f", "#e76f51")

cell_colors <- c("A549" = "#f2cc8f", "HCT116" = "#ED1C24", "HEK293" = "#00B4D8", 
                               "HepG2" = "#B3B3B3", "K562" = "#A67C52", "MCF7" = "#81B29A", 
                               "U2OS" = "#3D405B", "mES" = "#EAB69F", "NPC" = "#E07A5F")

ggplot_custom <- function(...) ggplot2::ggplot(...) + 
  scale_color_manual(values = colors_diverse) + 
  scale_fill_manual(values = colors_diverse)

Loading data

# Import processed bc counts from the preprocessing step
cDNA_df <- read.csv("/DATA/usr/m.trauernicht/projects/SuRE-TF/data/gcf7124_stimulations/results/mt20240305_reporter_activity_filt_combined.csv", header = T)

## We are not going to use NIH3T3 data, so remove it for now
cDNA_df <- cDNA_df %>%
  filter(cell != "NIH3T3")

## Rename stimulation status of control conditions
cDNA_df$stimulation[is.na(cDNA_df$stimulation)] <- "no"

# Load RNA-seq data
tf_rna <- read_tsv("/DATA/usr/m.trauernicht/data/RNA_seq/rna_tpm_all_tfs.tsv")
## Rows: 199704 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): tf, cell
## dbl (3): TPM, TPM_norm, nTPM
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Prepare data frame for following analyses
cDNA_df2 <- cDNA_df %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  dplyr::filter(neg_ctrls == "No", 
                #commercial_reporter == "No", 
                hPGK == "No", 
                str_detect(tf, "RANDOM", negate = T), 
                native_enhancer == "No") %>%
  distinct(tf, condition, stimulation, reference_condition, tf_target, effect_size, off_target,
           cell, reporter_id, commercial_reporter, reporter_activity_minP, gcf, reporter_dif_minP) %>%
  mutate(reporter_activity_minP = log2(reporter_activity_minP)) %>%
  distinct()

## Define optimal candidate conditions for each TF: either the highest expressing cell line, or the stimulated condition
### Select most active stimulation condition per TF
ref_conditions1 <- cDNA_df2 %>%
  filter(tf_target == 1) %>%
  distinct(tf, condition, cell, reporter_id, commercial_reporter, reporter_activity_minP) %>%
  group_by(tf, condition) %>%
  dplyr::arrange(desc(reporter_activity_minP)) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(median_reporter_activity_minP = ave(reporter_activity_minP, tf, condition, FUN = median)) %>%
  group_by(tf) %>%
  arrange(desc(median_reporter_activity_minP)) %>%
  top_n(1) %>%
  ungroup() %>%
  distinct(tf, condition)
## Selecting by reporter_activity_minP
## Selecting by median_reporter_activity_minP
### Top 3 conditions per TF based on highest TPM and highest median reporter activity
ref_conditions2 <- merge(cDNA_df2 %>% filter(stimulation == "no"), 
                         tf_rna %>% mutate(tf = toupper(tf)), by = c("cell", "tf")) %>%
  mutate(mean_reporter_activity_minP = ave(reporter_activity_minP, tf, condition, FUN = median)) %>%
  distinct(mean_reporter_activity_minP, tf, condition, nTPM) %>%
  #filter(!condition %in% c("mES_Nanog_ctrl", "mES_Pou5f1_ctrl", "mES_Sox2_AID_UMI", "mES_Sox2_ctrl", "mES_NT", "HepG2_NT")) %>%
  group_by(tf) %>%
  dplyr::arrange(desc(nTPM)) %>%
  top_n(n = 3, wt = nTPM) %>%
  ungroup() %>%
  group_by(tf) %>%
  dplyr::arrange(desc(mean_reporter_activity_minP)) %>%
  top_n(n = 2, wt = mean_reporter_activity_minP) %>%
  ungroup() %>%
  distinct(tf, condition)

## Now I combine both to have 2-3 top conditions per TF
ref_conditions <- rbind(ref_conditions1, ref_conditions2) %>%
  rbind(data.frame(tf = c("PAX6", "ESRRB", "SMAD4", "GATA1", "RUNX2", "NR5A2", "SMAD2::SMAD3::SMAD4", "RFX1",
                          "FOXO1", "FOSL1", "ETS2", "ONECUT1"),
                   condition = c("HepG2_NT", "mES_serum_2i_LIF", "HepG2_NT", "K562_Hemin", "U2OS", "mES_2i_LIF", "HepG2_NT", "mES_2i_LIF",
                                 "mES_FOXA1-OE", "mES_2i_LIF", "HepG2_NT", "HepG2_NT")))

## Data frame with off-target activities
off_target_activities <- cDNA_df2 %>% 
  filter(tf_target == 2) %>%
  filter(tf != "SOX2" | (tf == "SOX2" & condition != "mES_POU2F1")) %>%
  mutate(effect_size = as.numeric(effect_size)) %>%
  mutate(effect_size = ifelse(is.na(effect_size) == T, 0, effect_size)) %>%
  mutate(reporter_dif_mean = ave(reporter_dif_minP, tf, condition, FUN = function(x) median(x, na.rm = T))) %>% 
  mutate(reporter_dif_mean = ifelse(effect_size == 0, -reporter_dif_mean, reporter_dif_mean)) %>%
  ## For some TFs I have multiple off-target perturbations: keep strongest effect only
  group_by(tf) %>%
  arrange(desc(reporter_dif_mean)) %>% 
  top_n(1, wt = reporter_dif_mean) %>%
  ungroup() %>%
  dplyr::select(reporter_id, tf, commercial_reporter, "off_target_activity" = reporter_dif_minP, condition, effect_size) %>%
  distinct()

on_target_activities <- cDNA_df2 %>% 
  filter(tf_target == 1) %>%
  mutate(effect_size = as.numeric(effect_size)) %>%
  mutate(reporter_dif_mean = ave(reporter_dif_minP, tf, condition, FUN = function(x) median(x, na.rm = T))) %>% 
  mutate(reporter_dif_mean = ifelse(effect_size == 0, -reporter_dif_mean, reporter_dif_mean)) %>%
  ## Some TFs have multiple perturbations: keep strongest effect only
  group_by(tf) %>%
  arrange(desc(reporter_dif_mean)) %>% 
  top_n(1, wt = reporter_dif_mean) %>%
  ungroup() %>%
  dplyr::select(reporter_id, tf, commercial_reporter, reporter_dif_minP, condition, effect_size) %>%
  distinct()

Quality checks

Aim: Characterize reporter activities in the nine probed cell lines.

## Figure S1E: Activities per cell type - density
cDNA_df2 <- cDNA_df %>%
  filter(stimulation == "no",
         commercial_reporter == "No", 
         hPGK == "No", 
         str_detect(tf, "RANDOM", negate = T), 
         native_enhancer == "No") %>%
  distinct(tf, reporter_activity_minP, cell, reporter_id, neg_ctrls) %>%
  mutate(reporter_activity_minP = ave(reporter_activity_minP, cell, reporter_id, FUN = mean)) %>%
  mutate(reporter_activity_minP = log2(reporter_activity_minP)) %>%
  distinct()  

ggplot(cDNA_df2 %>%
         mutate(cell = factor(cell, levels = c("A549", "K562", "HCT116", "MCF7", "HepG2", 
                                               "U2OS", "HEK293", "mES", "NPC", "NIH3T3"))),
       aes(x = reporter_activity_minP, fill = neg_ctrls)) +
  geom_density(alpha = .4) +
  facet_wrap(~cell, ncol = 2) +
  scale_fill_manual(values = c("Yes" = "grey70", "No" = "#DD6B48")) +
  theme_pubr(border = T) +
  ggtitle("Figure S1E: Activity distribution per cell type")

## Figure S1F: Compare TF reporters to native enhancer controls
native_activities <- cDNA_df %>%
  filter(condition == "mES_2i_LIF", neg_ctrls == "No") %>%
  distinct(reporter_id, tf, reporter_activity_minP, condition) %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  mutate(native_enhancer = ifelse(tf %in% c("e97", "e6", "e19", "e11", "e93"), tf, "tf_reporter")) %>%
  mutate(native_enhancer2 = ifelse(native_enhancer == "tf_reporter", native_enhancer, "native_enhancer"))

ggplot(native_activities,
       aes(x = native_enhancer2, y = log2(reporter_activity_minP))) +
  geom_quasirandom_rast(raster.dpi = 600, color = "black", size = .5) +
  stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", color = "red", width = 0.25, lwd = 0.4) +
  theme_pubr() +
  ggtitle("Figure S1F: Comparison of TF reporters to native enhancer controls")

## Figure S2: Median TF activities in all 9 cell types
tf_activities_median <- cDNA_df %>%
  filter(stimulation == "no",
         neg_ctrls == "No",
         commercial_reporter == "No", 
         hPGK == "No", 
         str_detect(tf, "RANDOM", negate = T), 
         native_enhancer == "No") %>%
  distinct(tf, reporter_activity_minP, cell, neg_ctrls) %>%
  mutate(reporter_activity_minP = ave(reporter_activity_minP, cell, tf, FUN = median)) %>%
  mutate(reporter_activity_minP = log2(reporter_activity_minP)) %>%
  distinct(reporter_activity_minP, tf, cell)

ggplot(tf_activities_median,
       aes(x = cell, y = reporter_activity_minP, fill = reporter_activity_minP)) +
  geom_bar(stat = "identity") +
  facet_wrap(~tf) +
  scale_fill_gradient2(low = "#99B2DD", mid = "white", high = "#E07A5F", midpoint = 0, limits = c(-0.5,5), oob = squish) +
  theme_pubr(x.text.angle = 90, border = T) +
  ggtitle("Figure S2: Median TF activities in all 9 cell types")

## Per individual reporter
# tf_reporter_activities_median <- cDNA_df %>%
#   filter(stimulation == "no",
#          neg_ctrls == "No",
#          commercial_reporter == "No", 
#          hPGK == "No", 
#          str_detect(tf, "RANDOM", negate = T), 
#          native_enhancer == "No") %>%
#   distinct(tf, reporter_activity_minP, reporter_id, cell) %>%
#   mutate(reporter_activity_minP = ave(reporter_activity_minP, reporter_id, cell, tf, FUN = median)) %>%
#   mutate(reporter_activity_minP = log2(reporter_activity_minP)) %>%
#   distinct(reporter_activity_minP, tf, reporter_id, cell)
# 
# ggplot(tf_reporter_activities_median,
#        aes(x = cell, y = reporter_activity_minP)) +
#   geom_boxplot() +
#   facet_wrap(~tf) +
#   theme_pubr(x.text.angle = 90, border = T)

Explore reporter fold-changes upon perturbations per TF

fc_df <- cDNA_df %>%
  filter(stimulation != "no", neg_ctrls == "No", native_enhancer == "No", hPGK == "No") %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  distinct(reporter_id, tf, reporter_dif_minP, condition, tf_target) %>%
  mutate(mean_dif = ave(reporter_dif_minP, tf, condition, FUN = function(x) mean(x, na.rm = T))) %>%
  mutate(tf_length = ave(tf, tf, FUN = length)) %>%
  filter(tf_length > 10)

# for (i in unique(fc_df$tf)) {
#   p <- ggplot(fc_df %>% filter(tf == i),
#          aes(x = reorder(condition, -mean_dif), y = reporter_dif_minP, color = as.character(tf_target))) +
#     geom_hline(yintercept = 0, lty = 2) +
#     geom_hline(yintercept = 1) +
#     geom_hline(yintercept = -1) +
#     geom_point() +
#     scale_color_manual(values = c("0" = "black", "1" = "red", "2" = "blue")) +
#     theme_pubr(x.text.angle = 90) +
#     ggtitle(i)
# 
#   print(p)
# }
# 
# for (i in unique(fc_df$condition)) {
#   p <- ggplot(fc_df %>% filter(condition == i),
#          aes(x = reorder(tf, -mean_dif), y = reporter_dif_minP, color = as.character(tf_target))) +
#     geom_hline(yintercept = 0, lty = 2) +
#     geom_hline(yintercept = 1) +
#     geom_hline(yintercept = -1) +
#     geom_point() +
#     scale_color_manual(values = c("0" = "black", "1" = "red", "2" = "blue")) +
#     theme_pubr(x.text.angle = 90) +
#     ggtitle(i)
# 
#   print(p)
# }

Correlate with RNA-seq data

Aim: Does the reporter activity correlate with transcript abundance across conditions?

## Prepare tf reporter data frame
cDNA_df2 <- cDNA_df %>%
  #filter(tf != "RFX1") %>%
  dplyr::filter(neg_ctrls == "No",
                #commercial_reporter == "No",
                stimulation == "no",
                hPGK == "No",
                str_detect(tf, "RANDOM", negate = T),
                native_enhancer == "No") %>%
  distinct(tf, reporter_activity_minP, cell, reporter_id, commercial_reporter, spacing, promoter, background, distance) %>%
  mutate(reporter_activity_minP = ave(reporter_activity_minP, cell, reporter_id, FUN = mean)) %>%
  mutate(reporter_activity_minP = log2(reporter_activity_minP)) %>%
  distinct() %>%
  mutate(tf = gsub("_.*", "", tf))

## Combine TF reporter data with RNA-seq data
cDNA_df3 <- cDNA_df2 %>%
  left_join(tf_rna) %>%
  mutate(TPM = ifelse(is.na(TPM), 0, TPM)) %>%
  mutate(nTPM = ifelse(is.na(nTPM), 0, nTPM)) %>%
  mutate(log_nTPM = log2(nTPM + 0.01)) %>%
  mutate(range_TPM = quantile(log_nTPM, probs = 0.95) - quantile(log_nTPM, probs = 0.05)) %>%
  mutate(range_activity = quantile(reporter_activity_minP, probs = 0.95) - quantile(reporter_activity_minP, probs = 0.05)) %>%
  mutate(dif_range = range_TPM / range_activity) %>%
  mutate(log_nTPM = log_nTPM / dif_range) %>%
  dplyr::select(-range_TPM, -range_activity, -dif_range)
## Joining, by = c("cell", "tf")
## Compute pearson correlations for each individual reporter of reporter activity vs. nTPM counts across the nine cell types
for (i in unique(cDNA_df3$reporter_id)) {
  if (nrow(cDNA_df3[cDNA_df3$reporter_id == i,]) > 4) {
    
    #print(i)
    
    cDNA_df3$cor_pval[cDNA_df3$reporter_id == i] <- cor.test(2^cDNA_df3$reporter_activity_minP[cDNA_df3$reporter_id == i],
                                                                    cDNA_df3$nTPM[cDNA_df3$reporter_id == i], method = "pearson", exact = F)$p.value
    
    cDNA_df3$cor[cDNA_df3$reporter_id == i] <- cor(2^cDNA_df3$reporter_activity_minP[cDNA_df3$reporter_id == i], 
                                                 cDNA_df3$nTPM[cDNA_df3$reporter_id == i], method = "pearson")
  }
  else {
    cDNA_df3$cor_pval[cDNA_df3$reporter_id == i] <- 1
    cDNA_df3$cor[cDNA_df3$reporter_id == i] <- 0
  }
}

## I am only interested in positive correlations, so set all negative correlations to zero
cDNA_df3 <- cDNA_df3 %>%
  mutate(cor_pval = ifelse(cor_pval == cor, 1, cor_pval)) %>%
  group_by(tf) %>%
  mutate(cor_pval_fdr = p.adjust(cor_pval, method = "fdr")) %>%
  ungroup() %>%
  mutate(cor_pval = ifelse(cor < 0, 1, cor_pval)) %>%
  mutate(cor_pval_fdr = ifelse(cor < 0, 1, cor_pval_fdr)) %>%
  mutate(cor_pval = -log10(cor_pval)) %>%
  mutate(cor_pval_fdr = -log10(cor_pval_fdr)) %>%
  mutate(cor_pval = ifelse(is.na(cor_pval), 1, cor_pval)) %>%
  mutate(cor_pval_fdr = ifelse(is.na(cor_pval_fdr), 1, cor_pval_fdr)) %>%
  mutate(cor = ifelse(is.na(cor), 1, cor))

## Filter for TFs where correlations actually make sense -> minimum TPM > 0.5, maximum TPM < 0.5, dif_TPM > 5
rna_correlations_df <- cDNA_df3 %>%
  mutate(min_TPM = ave(nTPM, tf, FUN = min),
         max_TPM = ave(nTPM, tf, FUN = max)) %>%
  mutate(dif_TPM = max_TPM - min_TPM) %>%
  mutate(mean_tf_activity = ave(reporter_activity_minP, tf, cell, FUN = median)) %>%
  mutate(max_mean_tf_activity = ave(mean_tf_activity, tf, FUN = max)) %>%
  mutate(reasonable = ifelse(max_TPM > 8 &
                               dif_TPM > 5 &
                               min_TPM < 1 &
                               max_mean_tf_activity > 0.75 |
                               tf %in% c("GLI1", "STAT3", "SP1", "TEAD1", "NFKB1",
                                         "ZFX", "NR4A1"), "Yes", "No")) %>%
  mutate(reasonable = ifelse(tf %in% c("TCF7L2", "TCF7", "TFEB", "TP53", "GATA3", "CEBPA"), "No", reasonable))

# rna_correlations_df <- cDNA_df3 %>%
#   mutate(min_TPM = ave(nTPM, tf, FUN = min),
#          max_TPM = ave(nTPM, tf, FUN = max)) %>%
#   mutate(dif_TPM = max_TPM / min_TPM) %>%
#   mutate(mean_tf_activity = ave(reporter_activity_minP, tf, cell, FUN = mean)) %>%
#   mutate(max_mean_tf_activity = ave(mean_tf_activity, tf, FUN = max)) %>%
#   mutate(max_pval = ave(cor_pval, tf, FUN = max)) %>%
#   mutate(reasonable = ifelse(max_TPM > 8 & 
#                                min_TPM < 1 &
#                                max_mean_tf_activity > 0.75 | 
#                                max_pval > 1, "Yes", "No") )

reasonable_corr <- rna_correlations_df %>%
  filter(reasonable == "No")

## Export correlations
rna_correlations <- rna_correlations_df %>%
  filter(reasonable == "Yes") %>%
  distinct(reporter_id, cor_pval, cor) %>%
  mutate(cor_pval = ifelse(reporter_id %in% reasonable_corr$reporter_id, 0, cor_pval)) %>%
  mutate(cor = ifelse(reporter_id %in% reasonable_corr$reporter_id, 0, cor))

## Exclude TFs with only commercial reporters
rna_correlations_rem <- rna_correlations_df %>%
  filter(reasonable == "Yes") %>%
  distinct(tf, reporter_id) %>%
  mutate(n_tf = as.numeric(ave(tf, tf, FUN = length))) %>%
  filter(n_tf > 10)

## Figure 3A: Plot correlation p-values per TF (export 3x6.5)
ggplot(rna_correlations_df %>%
         filter(reasonable == "Yes", tf %in% rna_correlations_rem$tf) %>%
         distinct(reporter_id, cor_pval, cor, tf, commercial_reporter) %>%
         mutate(sign = ifelse(cor_pval > 1, "Yes", "No")) %>%
         mutate(mean_cor = ave(cor, tf, FUN = mean)),
       aes(x = reorder(tf, -mean_cor), y = cor)) +
  geom_hline(yintercept = 0, lty = 2) +
  geom_quasirandom(size = 1.5, aes(color = sign, shape = commercial_reporter)) +
  ylim(-1,1) +
  theme_pubr(border = T, x.text.angle = 90, legend = "none") +
  stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", color = "red", width = 0.25, lwd = 0.4) +
  scale_color_manual(values = c("Yes" = colors_diverse[2], "No" = colors_diverse[1])) +
  ggtitle("Figure 3A: Correlation p-values per TF")

### SP1: Random core promoter (Export 4x4)
ggplot(cDNA_df3 %>%
           filter(tf == "SP1", commercial_reporter == "No")%>%
           mutate(promoter = gsub(".*(Random|minP|hBGm|mCMV).*", "\\1", reporter_id)) %>%
           distinct(cell, reporter_id, reporter_activity_minP, nTPM, promoter, cor_pval),
       aes(x = nTPM, y = 2^reporter_activity_minP)) +
  geom_vline(xintercept = 1, lty = 1) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_hline(yintercept = 1, lty = 2) +
  geom_point(size = 3, shape = 21, aes(fill = cell), color = "black") +
  scale_fill_manual(values = cell_colors) +
  geom_smooth(method = "lm", se = F, aes(group = reporter_id, color = cor_pval)) +
  theme_pubr(border = T) +
  facet_wrap(~promoter) +
  scale_color_gradient(low = "#264653", high = "#9AC1AE")
## `geom_smooth()` using formula = 'y ~ x'

### Figure 3B: GATA4: Impact of spacer sequence (export 3x5.5)
ggplot(cDNA_df3 %>%
         filter(reporter_id %in% c("GATA4_10bp_10bp_minP_3", "GATA4_10bp_10bp_minP_1")) %>%
         distinct(cell, reporter_id, reporter_activity_minP, nTPM),
       aes(x = nTPM, y = 2^reporter_activity_minP)) +
  geom_vline(xintercept = 1, lty = 1) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_hline(yintercept = 1, lty = 2) +
  geom_smooth(method = "lm", color = "black") +
  geom_point(size = 3, shape = 21, aes(fill = cell)) +
  scale_fill_manual(values = cell_colors) +
  facet_wrap(~reporter_id, ncol = 1) +
  theme_pubr(border = T) +
  ggtitle("Figure 3B: GATA4: Impact of spacer sequence")
## `geom_smooth()` using formula = 'y ~ x'

gata4_plot <- rna_correlations_df %>%
  filter(tf == "GATA4") %>%
  distinct(reporter_id, cor_pval, spacing, background) %>%
  mutate(mean_cor_pval = ave(cor_pval, spacing, background, FUN = mean))

## Figure S3A
ggplot() +
  geom_hline(yintercept = 1, lty = 2) +
  geom_bar(data = gata4_plot %>% distinct(mean_cor_pval, spacing, background),
           aes(x = as.factor(background), y = mean_cor_pval), stat = "identity", fill = "grey80") +
  geom_point(data = gata4_plot, size = 2, 
              aes(color = ifelse(cor_pval > 1, "Yes", "No"),
                  x = as.factor(background), y = cor_pval)) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("Yes" = colors_diverse[2], "No" = colors_diverse[1])) +
  facet_wrap(~spacing) +
  ggtitle("Figure S3A: GATA4: Impact of spacer sequence")

### Figure 3D: TFCP2L1: Impact of spacer length
ggplot(cDNA_df3 %>%
         filter(reporter_id %in% c("TFCP2L1_10bp_10bp_minP_1", "TFCP2L1_5bp_10bp_minP_1")) %>%
         distinct(cell, reporter_id, reporter_activity_minP, nTPM),
       aes(x = nTPM, y = 2^reporter_activity_minP)) +
  geom_vline(xintercept = 1, lty = 1) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_hline(yintercept = 1, lty = 2) +
  geom_smooth(method = "lm", color = "black") +
  geom_point(size = 3, shape = 21, aes(fill = cell)) +
  scale_fill_manual(values = cell_colors) +
  facet_wrap(~reporter_id, ncol = 1) +
  theme_pubr(border = T) +
  ggtitle("Figure 3D: TFCP2L1: Impact of spacer length")
## `geom_smooth()` using formula = 'y ~ x'

tfcp2l1_plot <- rna_correlations_df %>%
  filter(tf == "TFCP2L1") %>%
  distinct(reporter_id, cor_pval, spacing, background) %>%
  mutate(mean_cor_pval = ave(cor_pval, spacing, background, FUN = mean))

## Figure S3C: Export 3x4
ggplot() +
  geom_hline(yintercept = 1, lty = 2) +
  geom_bar(data = tfcp2l1_plot %>% distinct(mean_cor_pval, spacing, background),
           aes(x = as.factor(background), y = mean_cor_pval), stat = "identity", fill = "grey80") +
  geom_point(data = tfcp2l1_plot, size = 2, 
              aes(color = ifelse(cor_pval > 1, "Yes", "No"),
                  x = as.factor(background), y = cor_pval)) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("Yes" = colors_diverse[2], "No" = colors_diverse[1])) +
  facet_wrap(~spacing) +
  ggtitle("Figure S3C: TFCP2L1: Impact of spacer length")

### Figure 3C: GATA1: Impact of distance
ggplot(cDNA_df3 %>%
           filter(reporter_id %in% c("GATA1_5bp_10bp_mCMV_2", "GATA1_5bp_21bp_mCMV_2")) %>%
           distinct(cell, reporter_id, reporter_activity_minP, nTPM),
       aes(x = nTPM, y = 2^reporter_activity_minP)) +
  geom_vline(xintercept = 1, lty = 1) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_hline(yintercept = 1, lty = 2) +
  geom_smooth(method = "lm", color = "black") +
  geom_point(size = 3, shape = 21, aes(fill = cell)) +
  scale_fill_manual(values = cell_colors) +
  facet_wrap(~reporter_id, ncol = 1) +
  theme_pubr(border = T) +
  ggtitle("Figure 3C: GATA1: Impact of distance")
## `geom_smooth()` using formula = 'y ~ x'

gata1_plot <- rna_correlations_df %>%
  filter(tf == "GATA1") %>%
  distinct(reporter_id, cor_pval, distance) %>%
  mutate(mean_cor_pval = ave(cor_pval, distance, FUN = mean))

## Figure S3B
ggplot() +
  geom_hline(yintercept = 1, lty = 2) +
  geom_bar(data = gata1_plot %>% distinct(mean_cor_pval, distance),
           aes(x = as.factor(distance), y = mean_cor_pval), stat = "identity", fill = "grey80") +
  geom_point(data = gata1_plot, size = 2, 
              aes(color = ifelse(cor_pval > 1, "Yes", "No"),
                  x = as.factor(distance), y = cor_pval)) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("Yes" = colors_diverse[2], "No" = colors_diverse[1])) +
  ggtitle("Figure S3B: GATA1: Impact of distance")

### NR5A2: Impact of spacer length - repressive role?
ggplot(rna_correlations_df %>%
           filter(tf == "NR5A2") %>%
           distinct(reporter_id, cor, spacing),
       aes(x = as.factor(spacing), y = cor)) +
    geom_quasirandom(size = 2) +
    geom_boxplot(alpha = .2, outlier.shape = NA) +
    theme_pubr()

Reporter design affects cell-type specificity.


Add proteomics data

Aim: Similar to before: does reporter activity correlate with protein abundance across cell types?

# proteomics_df_long <- read_tsv("/DATA/usr/m.trauernicht/data/RNA_seq/protein_abundance_all_tfs.tsv")
# 
# ## Combine TF reporter data with RNA-seq data
# cDNA_df3 <- merge(cDNA_df2, proteomics_df_long, all = T) %>%
#   na.omit()
# 
# ## Compute spearman correlations for each individual reporter of reporter activity vs. TPM counts across the nine cell types
# for (i in unique(cDNA_df3$reporter_id)) {
#   if (nrow(cDNA_df3[cDNA_df3$reporter_id == i,]) > 4) {
#     
#     cDNA_df3$cor_pval[cDNA_df3$reporter_id == i] <- cor.test(cDNA_df3$reporter_activity_minP[cDNA_df3$reporter_id == i],
#                                                              cDNA_df3$protein_abundance[cDNA_df3$reporter_id == i], method = "pearson", exact = F)$p.value
#     
#     cDNA_df3$cor[cDNA_df3$reporter_id == i] <- cor(cDNA_df3$reporter_activity_minP[cDNA_df3$reporter_id == i],
#                                                    cDNA_df3$protein_abundance[cDNA_df3$reporter_id == i], method = "pearson")
#   }
#   else {
#     cDNA_df3$cor_pval[cDNA_df3$reporter_id == i] <- 1
#     cDNA_df3$cor[cDNA_df3$reporter_id == i] <- 0
#   }
# }
# 
# ## I am only interested in positive correlations, so set all negative correlations to zero
# cDNA_df3 <- cDNA_df3 %>%
#   mutate(cor_pval = ifelse(cor_pval == cor, 1, cor_pval)) %>%
#   group_by(tf) %>%
#   mutate(cor_pval_fdr = p.adjust(cor_pval, method = "fdr")) %>%
#   ungroup() %>%
#   mutate(cor_pval = ifelse(cor < 0, 1, cor_pval)) %>%
#   mutate(cor_pval_fdr = ifelse(cor < 0, 1, cor_pval_fdr)) %>%
#   mutate(cor_pval = -log10(cor_pval)) %>%
#   mutate(cor_pval_fdr = -log10(cor_pval_fdr)) %>%
#   mutate(cor_pval = ifelse(is.na(cor_pval), 1, cor_pval)) %>%
#   mutate(cor_pval_fdr = ifelse(is.na(cor_pval_fdr), 1, cor_pval_fdr)) %>%
#   mutate(cor = ifelse(is.na(cor), 1, cor))
# 
# ## Export correlations
# protein_correlations <- cDNA_df3 %>%
#   distinct(reporter_id, cor_pval, cor)
# 
# ## Correlate RNA and protein correlations
# rna_protein_cor <- merge(protein_correlations %>%
#                            dplyr::select(reporter_id, "prot_cor" = cor_pval),
#                          rna_correlations %>%
#                            dplyr::select(reporter_id, "rna_cor" = cor_pval), all = T) %>%
#   replace(is.na(.), 0)
# 
# ggplot(rna_protein_cor %>%
#          mutate(tf = gsub("(.*)_.*_.*_.*_.*", "\\1", reporter_id)) %>%
#          mutate(source = gsub(".*_(.*)", "\\1", tf)) %>%
#          mutate(source = ifelse(source == "TF-romanov", "romanov", source)) %>%
#          mutate(source = ifelse(!source %in% c("romanov", "TF-seq", "promega"), "designed", source)) %>%
#          mutate(tf = gsub("_.*", "", tf)),
#        aes(x = prot_cor, y = rna_cor)) +
#   geom_point(aes(color = source)) +
#   geom_smooth(method = "lm") +
#   theme_pubr() +
#   facet_wrap(~tf)
# 
# protein_correlations2 <- protein_correlations %>%
#   dplyr::select(reporter_id, "protein_cor" = cor)
# 
# ggplot(rna_correlations_df %>%
#          left_join(protein_correlations2) %>%
#          filter(commercial_reporter == "No") %>%
#          distinct(reporter_id, cor_pval, tf, protein_cor) %>%
#          mutate(sign_rna = ifelse(cor_pval < 1, "No", "Yes")) %>%
#          mutate(sign_protein = ifelse(protein_cor < 1, "No", "Yes")) %>%
#          mutate(mean_cor_pval = ave(cor_pval, tf, FUN = mean)),
#        aes(x = reorder(tf, -mean_cor_pval), y = cor_pval)) +
#   geom_quasirandom(aes(color = protein_cor)) +
#   geom_hline(yintercept = 1, lty = 2) +
#   scale_color_gradient2(low = "#264653", mid = "white", high = "#e76f51") +
#   theme_pubr(border = T, x.text.angle = 90)
# 
# ggplot_custom(cDNA_df3 %>%
#                 filter(commercial_reporter == "No") %>%
#                 distinct(reporter_id, cor_pval, tf, commercial_reporter) %>%
#                 mutate(mean_cor_pval = ave(cor_pval, tf, FUN = mean)) %>%
#                 mutate(sign = ifelse(cor_pval < 1.30103, "No", "Yes")) %>%
#                 left_join(rna_correlations_df %>% dplyr::select(reporter_id, "cor_rna" = cor_pval) %>% distinct()),
#        aes(x = reorder(tf, -mean_cor_pval), y = cor_pval)) +
#   geom_quasirandom(aes(color = cor_rna)) +
#   geom_hline(yintercept = 1.30103, lty = 2) +
#   scale_color_gradient(low = "grey90",high = "#e76f51") +
#   theme_pubr(border = T, x.text.angle = 90)
# 
# ggplot(rna_correlations_df %>%
#          left_join(protein_correlations2) %>%
#          filter(commercial_reporter == "No") %>%
#          distinct(reporter_id, cor, tf, protein_cor) %>%
#          mutate(sign_rna = ifelse(cor < 1, "No", "Yes")) %>%
#          mutate(sign_protein = ifelse(protein_cor < 1, "No", "Yes")) %>%
#          mutate(mean_cor_pval = ave(cor, tf, FUN = mean)),
#        aes(x = reorder(tf, -mean_cor_pval), y = cor)) +
#   geom_quasirandom(aes(color = protein_cor)) +
#   geom_hline(yintercept = 1, lty = 2) +
#   scale_color_gradient2(low = "#264653", mid = "white", high = "#e76f51") +
#   theme_pubr(border = T, x.text.angle = 90)

Some reporter activities correlate surprisingly well with protein abundance. Don’t use this for now - data not reliable enough.


Linear model - TF reporter grammar

Aim: If the measured activities can be fitted by a linear model, it is likely that the activity is true. If not, we might be looking at noise instead.

Linear models seem to work very well for most TFs.

Identify best condition per TF based on linear models and other data

Explore linear models for all TFs in the chosen conditions

Aim: Which design features are important for each TF? Are there general reporter design rules?

Conclusion: The choice of the minimal promoter is important for most TFs - all TFs drive higher transcription from minCMV. Spacer sequences seem to be very TF-specific.

Reporter quality score computation

Aim: We want to find for each TF the optimal reporters. Here I combine all data to compute reporter quality scores to find the best reporters for each TF.

## Combine all data into one data table
reference_activities <- cDNA_df %>%
  filter(neg_ctrls == "No", 
         hPGK == "No", 
         str_detect(tf, "RANDOM", negate = T), 
         native_enhancer == "No") %>%
  distinct(tf, condition, reporter_id, commercial_reporter, reporter_activity_minP) %>%
  mutate(reporter_activity_minP = log2(reporter_activity_minP)) %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  mutate(tf_condition = paste(tf, condition, sep = "_")) %>%
  filter(tf_condition %in% chosen_conditions$tf_condition) %>%
  dplyr::select(-tf_condition)

cDNA_df6 <- reference_activities %>%
  left_join(rna_correlations) %>%
  left_join(off_target_activities %>% dplyr::select(reporter_id, tf, commercial_reporter, off_target_activity, 
                                                    "off_target_perturbation" = condition, effect_size)) %>%
  mutate(off_target_activity = ifelse(effect_size == 1, -off_target_activity, off_target_activity)) %>%
  mutate(off_target_activity = ifelse(off_target_activity > 0 & !is.na(off_target_activity), 0, off_target_activity)) %>%
  dplyr::select(-effect_size) %>%
  left_join(on_target_activities %>% dplyr::select(reporter_id, tf, commercial_reporter, reporter_dif_minP, 
                                                   "perturbation_condition" = condition, effect_size)) %>%
  mutate(reporter_dif_minP = ifelse(effect_size == 0, -reporter_dif_minP, reporter_dif_minP)) %>%
  dplyr::select(-effect_size) %>%
  rowwise() %>%
  mutate(quality_score = sum(c_across(c("reporter_dif_minP", "reporter_activity_minP", "cor_pval", "off_target_activity")), na.rm = T)) %>%
  mutate(quality_score = ifelse(quality_score < 0, 0, quality_score))
## Joining, by = "reporter_id"
## Joining, by = c("reporter_id", "tf", "commercial_reporter")
## Joining, by = c("reporter_id", "tf", "commercial_reporter")
lm_pval <- cDNA_df_TF4 %>%
  group_by(tf) %>%
  dplyr::arrange(pval_adj, .by_group = T) %>%
  dplyr::slice(n = 1, .preserve = T) %>%
  distinct(tf, pval_adj)

## Compute confidence levels based on thresholds
cDNA_df_confidence2 <- cDNA_df6 %>%
  group_by(tf) %>%
  mutate(active = ifelse(reporter_activity_minP > 1, "Yes", "No")) %>%
  ungroup() %>%
  mutate(n_active = as.numeric(ave(tf, tf, active, FUN = function(x) length(x)))) %>%
  mutate(n_active = ifelse(active == "No", 0, n_active)) %>%
  mutate(n_active = ave(n_active, tf, FUN = max)) %>%
  left_join(lm_pval) %>%
  ## Add a level if reporter activities can be predicted by log-linear model
  mutate(conf_level = ifelse((commercial_reporter == "No" & pval_adj < 0.05 & reporter_activity_minP > 1) |
                               (commercial_reporter == "No" & n_active > 12 & reporter_activity_minP > 1) |
                               (commercial_reporter == "Yes" & reporter_activity_minP > 1), 1, 0)) %>% 
  ## Add a level if reporter activity follows transcript/protein levels
  mutate(conf_level = ifelse(cor_pval > 1 & !is.na(cor_pval), conf_level + 1, conf_level)) %>% 
  ## Add a level if TF perturbation leads to change in activity
  mutate(conf_level = ifelse(reporter_dif_minP > 1 & !is.na(reporter_dif_minP), conf_level + 2, conf_level)) %>% 
  ## Subtract a level if substantial off-target response
  mutate(conf_level = ifelse(conf_level >= 1 & off_target_activity < -1 & !is.na(off_target_activity), conf_level - 1, conf_level))
## Joining, by = "tf"
reporter_features <- cDNA_df %>%
  distinct(promoter, spacing, distance, background, tf, reporter_id)

# ## Figure S6
# plot_list_print <- list()
# for (i in unique(cDNA_df_confidence2$tf)){
#   
#   ## Generate a data frame with all reporters for a given TF
#   cDNA_df_confidence_example <- cDNA_df_confidence2 %>%
#     filter(tf == i) %>%
#     distinct(reporter_id, cor_pval, reporter_dif_minP, reporter_activity_minP, off_target_activity, 
#              quality_score, conf_level, pval_adj, commercial_reporter, perturbation_condition, off_target_perturbation,
#              condition, tf)
#   
#   ## Only do this for TFs with at least 10 reporters
#   if (nrow(cDNA_df_confidence_example) > 10) {
#     
#   ## Order reporters by confidence level and quality score
#   y_order <- cDNA_df_confidence_example %>%
#     ## sort by confidence level and then by quality score
#     arrange(desc(conf_level), 
#             desc(quality_score)) %>%
#     distinct(reporter_id)
#   
#   ## Plot the confidence levels of each reporter on top
#   confidence_colors <- c(`0` = "#CCCCCB", `1` = "#EFCABE", `2` = "#E7A08B", `3` = "#DD6840", `4` = "#BF4A27")
#     
#   plot_scores2 <- cDNA_df_confidence_example %>%
#     distinct(reporter_id, conf_level) %>%
#     column_to_rownames("reporter_id")
#   
#   plot_scores3 <- plot_scores2 %>%
#     as.matrix() %>%
#     t() %>%
#     .[,y_order$reporter_id] %>%
#     as.matrix() %>%
#     t()
# 
#   n_conf_levels <- length(unique(cDNA_df_confidence_example$conf_level))
#   
#   rownames(plot_scores3) <- "Confidence level"
#   
#   plot_1 <- pheatmap(plot_scores3, cluster_rows = F, cluster_cols = T, show_rownames = F, show_colnames = F, 
#                   breaks = c(-1, 0, 1, 2, 3), color = confidence_colors, clustering_method = "ward.D2",
#                   legend = F, cellwidth = 10, cellheight = 10, 
#                   fontsize = 10, cutree_cols = n_conf_levels, treeheight_col = 0)
#   
#   # Extract hierarchical clustering from plot_scores3
#   hc <- plot_1[["tree_col"]]
#   
#   hc$order <- 1:ncol(plot_scores3)
# 
#   plot_1 <- pheatmap(plot_scores3, cluster_rows = F, cluster_cols = hc, show_rownames = F, show_colnames = F, 
#                   breaks = c(-1, 0, 1, 2, 3), color = confidence_colors, clustering_method = "ward.D2",
#                   legend = F, cellwidth = 10, cellheight = 10, 
#                   fontsize = 10, cutree_cols = n_conf_levels, treeheight_col = 0, border_color = "black")
#   
#   hc <- plot_1[["tree_col"]]
#   
#   ## Plot the actual scores underlying the confidence levels underneath
#   
#   x_order <- c("Reporter activity", "TPM correlation", "Perturbation FC", "Off-target activity")
#   
#   plot_activities2 <- cDNA_df_confidence_example %>%
#          dplyr::select(-conf_level, -quality_score, -commercial_reporter, -off_target_perturbation, -perturbation_condition,
#                        -condition, -tf, reporter_id, "Reporter activity" = reporter_activity_minP,
#                        "TPM correlation" = cor_pval, "Off-target activity" = off_target_activity, "Perturbation FC" = reporter_dif_minP,
#                        pval_adj) %>%
#          pivot_longer(cols = -reporter_id, 
#                       names_to = "feature", values_to = "score") %>%
#          filter(feature != "pval_adj") %>%
#          pivot_wider(names_from = feature, values_from = score) %>%
#          column_to_rownames("reporter_id") %>%
#     t() %>%
#     .[,y_order$reporter_id] %>%
#     .[x_order,]
# 
#   plot_2 <- pheatmap(plot_activities2, cluster_rows = F, cluster_cols = hc, show_rownames = F, show_colnames = F, 
#                   color = colorRampPalette(c("#2166AC", "white", "#DD6940"))(100), 
#                   breaks = c(seq(-2, 0, length.out=ceiling(100/2) + 1), seq(4/100, 4, length.out=floor(100/2))), 
#                   legend_breaks = seq(-2, 4, length.out = 100), legend = F, cellwidth = 10, cellheight = 10, 
#                   fontsize = 10, treeheight_col = 0, cutree_cols = n_conf_levels, border_color = "black")
#   
#   ## Now plot the features of the reporters at the bottom
#   
#   x_order2 <- c("Promoter", "Spacer length", "Spacer sequence", "Distance", "Published/synthetic")
#   reporter_features$distance[reporter_features$distance == "10bp"] <- "10bps"
#   
#   feature_colors2 <- data.frame(feature = c("hBGm", "mCMV", "minP", "Random", "1", "2", "3", "21bp", "10bps",
#                                             "5bp", "10bp", "romanov", "TF-seq", "long", "promega", 
#                                             "TF-seq2", "TF-seq3", "promega2long", "promega2", ""),
#                                 feature2 = 1:20)
# 
#   plot_features2 <- cDNA_df_confidence_example %>%
#          left_join(reporter_features %>%  
#                     mutate(commercial_reporter_source = gsub(".*(_.*)", "\\1", tf)) %>%
#                      mutate(commercial_reporter_source = ifelse(tf == "SRF_promega2_long", "_promega2long", 
#                                                                 commercial_reporter_source)) %>%
#                      mutate(commercial_reporter_source = gsub("_", "", commercial_reporter_source)) %>%
#                      #mutate(commercial_reporter_source = ifelse(commercial_reporter_source == "promega2", "promega", commercial_reporter_source)) %>%
#                      mutate(commercial_reporter_source = ifelse(commercial_reporter_source == "TF-romanov", "romanov", commercial_reporter_source)) %>%
#                      mutate(commercial_reporter_source = ifelse(commercial_reporter_source %in% c("promega", "TF-seq", "romanov",
#                                                                                                   "TF-seq2", "TF-seq3", "promega2",
#                                                                                                   "long", "promega2long"),
#                                                                 commercial_reporter_source, "")) %>%
#                      dplyr::select(-tf)) %>% 
#          mutate(background = as.character(background)) %>%
#          pivot_longer(cols = c("promoter", "spacing", "background", "distance", "commercial_reporter_source"), 
#                       names_to = "feature_name", values_to = "feature") %>%
#     left_join(feature_colors2) %>%
#     dplyr::select(-feature) %>%
#     distinct() %>%
#          pivot_wider(names_from = feature_name, values_from = feature2) %>%
#          column_to_rownames("reporter_id") %>%
#         dplyr::select("Promoter" = promoter, "Spacer length" = spacing,  "Spacer sequence" = background, 
#                       "Distance" = distance, "Published/synthetic" = commercial_reporter_source) %>%
#     distinct() %>%
#     t() %>%
#     .[,y_order$reporter_id] %>%
#     .[x_order2,]
#   
#   breaks <- c(0:20)
#   colors <- c("#B4B4B4","#231F20","#CCCCCC","#F3F3F3","#EFCABD","#E7A08B","#DD6C48","#A4A48C","#D1D0C3",
#               "#BFD0D9","#7D9EB2","black","black","black","black","black","black","black","black","white") 
#   
#   plot_3 <- pheatmap(plot_features2, cluster_rows = F, cluster_cols = hc, show_rownames = F, show_colnames = F, 
#                   color = colors, legend = F, cellwidth = 10, cellheight = 10, breaks = breaks,
#                   fontsize = 10, treeheight_col = 0, cutree_cols = n_conf_levels, na_col = "white", border_color = "black")
# 
#   
#   ## Add some labeling at the very top
#   
#   cDNA_df_confidence_example <- cDNA_df_confidence_example %>%
#     mutate(condition2 = gsub("_.*", "", condition))
#   
#   title <- grid.text(
#   label = paste("TF = ", unique(cDNA_df_confidence_example$tf), "\n",
#                 "Perturbation Condition = ", unique(cDNA_df_confidence_example$perturbation_condition), "\n",
#                 "Off target perturbation condition = ", unique(cDNA_df_confidence_example$off_target_perturbation), "\n",
#                 "Cell type = ", unique(cDNA_df_confidence_example$condition2), "\n"),
#   x = 0.5,
#   y = 0.97,
#   just = "top",
#   gp = gpar(fontsize = 10)
# )
#   
#   
#   ## Combine the three pheatmaps into one plot
#   plot_list=list()
#   plot_list[[1]] <- plot_1[[4]]
#   plot_list[[2]] <- plot_2[[4]]
#   plot_list[[3]] <- plot_3[[4]]
#   
#   ## Create a list that contains all grids
#   plot_list_print[[i]] <- grid.arrange(title, arrangeGrob(grobs = plot_list, ncol=1), heights=c(1, 3, .5, .5))
#   
#   }
# }
# 
# # Generate a pdf that contains all the plots
# pdf("figures/quality_scores.pdf", width = 8.3, height = 3.9)
# for (i in names(plot_list_print)) {
#   grid.newpage()
#   grid.draw(plot_list_print[[i]])
# }
# dev.off()


## Plot number of reporters and confidence levels per TF
ggplot(cDNA_df_confidence2 %>%
         distinct(tf, reporter_id, conf_level, commercial_reporter) %>%
         mutate(max_conf_level = ave(conf_level, tf, FUN = max)) %>%
         filter(max_conf_level > 1) %>%
         mutate(conf_level_sum = ave(conf_level, tf, FUN = sum)) %>% 
         mutate(n_reporters = as.numeric(ave(reporter_id, tf, FUN = length))) %>%
         filter(n_reporters > 10) %>%
         mutate(average_level = conf_level_sum / n_reporters) %>%
         mutate(conf_level = factor(conf_level, levels = c("4", "3", "2", "1", "0"))) %>% 
         na.omit() %>%
         mutate(total_count = as.numeric(ave(tf, tf, commercial_reporter, FUN = length))) %>%
         mutate(conf_count = as.numeric(ave(tf, tf, conf_level, commercial_reporter, FUN = length))) %>%
         mutate(rel_count = conf_count/total_count) %>%
         distinct(tf, conf_level, rel_count, average_level, max_conf_level, commercial_reporter),
       aes(x = reorder(tf, -max_conf_level), y = rel_count, fill = conf_level)) +
  geom_bar(stat = "identity", position = "stack") +
  theme_pubr(x.text.angle = 90) + 
  scale_fill_manual(values = c("0" = "#CCCCCB", "1" = "#EFCABE", "2" = "#E7A08B", "3" = "#DD6840", "4" = "#BF4A27")) +
  facet_wrap(vars(commercial_reporter), ncol = 1)

## Figure S5B: Plot quality scores per TF (export 5x12)
ggplot(cDNA_df_confidence2 %>%
         distinct(tf, reporter_id, conf_level, quality_score, commercial_reporter) %>%
         mutate(conf_level_sum = ave(conf_level, tf, FUN = sum)) %>% 
         mutate(n_reporters = as.numeric(ave(reporter_id, tf, FUN = length))) %>%
         filter(n_reporters > 10) %>%
         mutate(average_level = conf_level_sum / n_reporters) %>%
         mutate(conf_level = factor(conf_level, levels = c("4", "3", "2", "1", "0"))) %>% 
         na.omit(),
       aes(x = reorder(tf, -average_level), y = quality_score, color = conf_level, shape = commercial_reporter)) +
  geom_quasirandom() +
  theme_pubr(x.text.angle = 90) + 
  scale_color_manual(values = c("0" = "#CCCCCB", "1" = "#EFCABE", "2" = "#E7A08B", "3" = "#DD6840", "4" = "#BF4A27"))

Plot activities in chosen conditions

Aim: Get a good overview of reporter activities

cell_colors <- c("A549" = "#f2cc8f", "HCT116" = "#babf95", "HEK293" = "#f4f1de", 
                               "HepG2" = "#5f797b", "K562" = "#8f5d5d", "MCF7" = "#81b29a", 
                               "U2OS" = "#3d405b", "mES" = "#eab69f", "NPC" = "#e07a5f") %>%
  as.data.frame() %>%
  rownames_to_column("cell")

## Quasirandom plots of reporter activities -- combined plot with best cell line per TF
cDNA_df_plot <- cDNA_df6 %>%
  distinct(tf, reporter_id, reporter_activity_minP, condition) %>%
  mutate(cell = gsub("_.*", "", condition)) %>%
  mutate(mean_tf_activity = ave(reporter_activity_minP, tf, FUN = mean)) %>%
  mutate(cell = factor(cell, levels = c("A549", "HCT116", "HEK293", "HepG2", "K562",
                                        "MCF7", "U2OS", "mES", "NPC"))) %>%
  mutate(n_tf = as.numeric(ave(tf, tf, FUN = length))) %>%
  filter(n_tf > 10)

tfs_colors <- cDNA_df_plot %>%
  distinct(tf, cell, mean_tf_activity) %>%
  arrange(desc(mean_tf_activity)) %>%
  left_join(cell_colors)
## Joining, by = "cell"
tfs_colors_vec <- tfs_colors %>%
  column_to_rownames("tf") %>%
  dplyr::select(-cell, -mean_tf_activity) %>%
  t() %>%
  base::as.vector()

names(tfs_colors_vec) <- tfs_colors$tf


# ggplot(cDNA_df_plot,
#        aes(x = reorder(tf, -mean_tf_activity), y = reporter_activity_minP)) +
#   geom_hline(yintercept = 1, lty = 3) + 
#   geom_hline(yintercept = -1, lty = 3) + 
#   geom_quasirandom(width = .2, aes(color = ifelse(abs(reporter_activity_minP) > 1, "black", "grey70")),
#                    alpha = .6) +
#   stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", color = "red", width = 0.5, lwd = 0.4) +
#   theme_pubr(x.text.angle = 90) +
#   scale_color_manual(values = c("black", "grey70")) +
#   theme(axis.text.x = element_text(colour=tfs_colors_vec))

## Same but just show two cell lines
cDNA_df_plot <- cDNA_df %>%  
  filter(neg_ctrls == "No",
         commercial_reporter == "No",
         hPGK == "No", 
         str_detect(tf, "RANDOM", negate = T), 
         native_enhancer == "No") %>%
  distinct(tf, reporter_id, reporter_activity_minP, condition) %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  mutate(cell = gsub("_.*", "", condition)) %>%
  mutate(reporter_activity_minP = log2(ave(reporter_activity_minP, reporter_id, cell, FUN = mean))) %>%
  distinct(tf, reporter_id, reporter_activity_minP, cell) %>%
  mutate(n_tf = as.numeric(ave(tf, tf, cell, FUN = length))) %>%
  filter(n_tf >= 20) %>%
  filter(cell %in% c("NPC", "mES"))

cDNA_df_plot2 <- cDNA_df_plot %>%
  filter(cell == "NPC") %>%
  mutate(mean_tf_activity = ave(reporter_activity_minP, tf, cell, FUN = mean)) %>%
  distinct(tf, mean_tf_activity)

cDNA_df_plot <- cDNA_df_plot %>%
  left_join(cDNA_df_plot2)
## Joining, by = "tf"
## Figure 1E
ggplot(cDNA_df_plot %>%
           mutate(cell = factor(cell, levels = c("NPC", "mES"))),
       aes(x = reorder(tf, -mean_tf_activity), y = reporter_activity_minP)) +
    geom_hline(yintercept = 1, lty = 3) + 
    geom_hline(yintercept = -1, lty = 3) + 
    geom_quasirandom_rast(width = .2, shape = 21, stroke = NA, aes(fill = ifelse(abs(reporter_activity_minP) > 1, "black", "grey70")),
                          alpha = .6, size = 2, raster.dpi = 600) +
    stat_summary(fun = median, fun.min = median, fun.max = median, geom = "crossbar", color = "red", width = 0.5, lwd = 0.4) +
    theme_classic_lines_90() +
    scale_fill_manual(values = c("black", "grey70")) +
    facet_wrap(~cell, nrow = 2) +
  ggtitle("Figure 1E: Reporter activity in NPC and mES cells")

ggplot(cDNA_df_plot %>%
         mutate(mean_tf_activity_cell = ave(reporter_activity_minP, tf, cell, FUN = mean)) %>%
           mutate(cell = factor(cell, levels = c("NPC", "mES"))) %>%
         distinct(mean_tf_activity, tf, cell, mean_tf_activity_cell),
       aes(x = reorder(tf, -mean_tf_activity), y = mean_tf_activity_cell)) +
    geom_hline(yintercept = 1, lty = 3) + 
    geom_hline(yintercept = -1, lty = 3) + 
    geom_bar(stat = "identity", aes(fill = ifelse(abs(mean_tf_activity_cell) > 1, "black", "grey70"))) +
    theme_classic_lines_90() +
    scale_fill_manual(values = c("black", "grey70")) +
    facet_wrap(~cell, nrow = 2)

Conclusion: There are some marked differences in activities between mES and NPC.

Showcase how reporter activities change upon perturbation

Aim: Demonstrate perturbation responses and differences between reporter designs.

## Prepare data frame for following analyses
cDNA_df2 <- cDNA_df %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  dplyr::filter(neg_ctrls == "No", 
                #commercial_reporter == "No", 
                hPGK == "No", 
                str_detect(tf, "RANDOM", negate = T), 
                native_enhancer == "No") %>%
  distinct(tf, condition, stimulation, reference_condition, tf_target, effect_size, off_target,
           cell, reporter_id, commercial_reporter, reporter_activity_minP, gcf, reporter_dif_minP) %>%
  mutate(reporter_activity_minP = log2(reporter_activity_minP)) %>%
  distinct() %>% 
  filter(tf_target == 1) %>%
  mutate(effect_size = as.numeric(effect_size))

### Selected conditions
selected_conditions <- cDNA_df_confidence2 %>%
  mutate(selected_perturbations = paste(tf, perturbation_condition, sep = "_")) %>%
  mutate(selected_off_perturbations = paste(tf, off_target_perturbation, sep = "_"))

### Figure 4A: All direct TF perturbations
ggplot(cDNA_df2 %>%
         filter(stimulation %in% c("KD", "overexpression", "degron")) %>%
         filter(!condition %in% c("mES_NR4A2-OE-cDIM12", "mES_OTX2-OE")) %>%
         distinct(tf, reporter_id, reporter_dif_minP, commercial_reporter, condition) %>%
         mutate(n_reporters = as.numeric(ave(reporter_id, tf, FUN = length))) %>%
         filter(n_reporters > 10) %>%
         mutate(tf = paste(tf, condition, sep = "_")) %>%
         mutate(selected = ifelse(tf %in% selected_conditions$selected_perturbations, "Yes", "No")) %>%
         #filter(selected == "Yes") %>%
         filter(tf != "RXRA_mES_NR4A2-OE") %>%
         mutate(dif_mean = ave(reporter_dif_minP, tf, FUN = function(x) mean(x, na.rm = T))),
       aes(x = reorder(tf, -dif_mean), y = reporter_dif_minP, shape = commercial_reporter)) +
  geom_hline(yintercept = 0, lty = 2) +
  geom_hline(yintercept = 1, lty = 1) +
  geom_hline(yintercept = -1, lty = 1) +
  geom_quasirandom(aes(color = ifelse(abs(reporter_dif_minP) > 1, "Yes", "No"))) +
  scale_color_manual(values = c("Yes" = "grey20", "No" = "grey90")) +
  theme_pubr(x.text.angle = 90, legend = "none") +
  ggtitle("Figure 4A: Reporter activity upon direct TF perturbations")
## Warning: Removed 25 rows containing missing values (`position_quasirandom()`).

### Figure 3A: All pathway perturbations
ggplot(cDNA_df2 %>%
         filter((stimulation %in% c("pathway")) | (condition == "mES_NR4A2-OE-cDIM12")) %>%
         mutate(selected = ifelse(tf %in% selected_conditions$selected_perturbations, "Yes", "No")) %>%
         distinct(tf, reporter_id, reporter_dif_minP, commercial_reporter, condition) %>%
         mutate(n_reporters = as.numeric(ave(reporter_id, tf, FUN = length))) %>%
         filter(n_reporters > 10) %>%
         mutate(tf = paste(tf, condition, sep = "_")) %>%
         mutate(selected = ifelse(tf %in% selected_conditions$selected_perturbations, "Yes", "No")) %>%
         #filter(selected == "Yes") %>%
         filter(!tf %in% c("RXRA_mES_NR4A2-OE-cDIM12", "RXRA_U2OS_Calcitriol")) %>%
         mutate(dif_mean = ave(reporter_dif_minP, tf, FUN = function(x) mean(x, na.rm = T))),
       aes(x = reorder(tf, -dif_mean), y = reporter_dif_minP, shape = commercial_reporter)) +
  geom_hline(yintercept = 0, lty = 2) +
  geom_hline(yintercept = 1, lty = 1) +
  geom_hline(yintercept = -1, lty = 1) +
  geom_quasirandom(aes(color = ifelse(abs(reporter_dif_minP) > 1, "Yes", "No"))) +
  scale_color_manual(values = c("Yes" = "grey20", "No" = "grey90")) +
  theme_pubr(x.text.angle = 90, legend = "none") +
  ggtitle("Figure 3A: Reporter activity upon pathway perturbations")
## Warning: Removed 7 rows containing missing values (`position_quasirandom()`).

### Figure S4: All off-target perturbations
cDNA_df2 <- cDNA_df %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  dplyr::filter(neg_ctrls == "No", 
                #commercial_reporter == "No", 
                hPGK == "No", 
                str_detect(tf, "RANDOM", negate = T), 
                native_enhancer == "No") %>%
  distinct(tf, condition, stimulation, reference_condition, tf_target, effect_size, off_target,
           cell, reporter_id, commercial_reporter, reporter_activity_minP, gcf, reporter_dif_minP) %>%
  mutate(reporter_activity_minP = log2(reporter_activity_minP)) %>%
  distinct() %>% 
  filter(tf_target == 2) %>%
  mutate(effect_size = as.numeric(effect_size))

ggplot(cDNA_df2 %>%
         filter(stimulation %in% c("KD", "overexpression", "degron")) %>%
         filter(!condition %in% c("mES_NR4A2-OE-cDIM12", "mES_OTX2-OE")) %>%
         distinct(tf, reporter_id, reporter_dif_minP, commercial_reporter, condition) %>%
         mutate(n_reporters = as.numeric(ave(reporter_id, tf, condition, FUN = length))) %>%
         #filter(n_reporters > 10) %>%
         mutate(tf = paste(tf, condition, sep = "_")) %>%
         mutate(selected = ifelse(tf %in% selected_conditions$selected_off_perturbations, "Yes", "No")) %>%
         #filter(selected == "Yes") %>%
         filter(tf != "RXRA_mES_NR4A2-OE") %>%
         mutate(dif_mean = ave(reporter_dif_minP, tf, FUN = function(x) mean(x, na.rm = T))),
       aes(x = reorder(tf, -dif_mean), y = reporter_dif_minP, shape = commercial_reporter)) +
  geom_hline(yintercept = 0, lty = 2) +
  geom_hline(yintercept = 1, lty = 1) +
  geom_hline(yintercept = -1, lty = 1) +
  geom_quasirandom(aes(color = ifelse(abs(reporter_dif_minP) > 1, "Yes", "No"))) +
  scale_color_manual(values = c("Yes" = "grey20", "No" = "grey90")) +
  theme_pubr(x.text.angle = 90, legend = "none") +
  ggtitle("Figure S4: Reporter activity upon off-target TF perturbations")
## Warning: Removed 21 rows containing missing values (`position_quasirandom()`).

ggplot(cDNA_df2 %>%
         filter((stimulation %in% c("pathway")) | (condition == "mES_NR4A2-OE-cDIM12")) %>%
         distinct(tf, reporter_id, reporter_dif_minP, commercial_reporter, condition) %>%
         mutate(n_reporters = as.numeric(ave(reporter_id, tf, FUN = length))) %>%
         #filter(n_reporters > 10) %>%
         mutate(tf = paste(tf, condition, sep = "_")) %>%
         mutate(selected = ifelse(tf %in% selected_conditions$selected_off_perturbations, "Yes", "No")) %>%
         #filter(selected == "Yes") %>%
         mutate(dif_mean = ave(reporter_dif_minP, tf, FUN = function(x) mean(x, na.rm = T))),
       aes(x = reorder(tf, -dif_mean), y = reporter_dif_minP, shape = commercial_reporter)) +
  geom_hline(yintercept = 0, lty = 2) +
  geom_hline(yintercept = 1, lty = 1) +
  geom_hline(yintercept = -1, lty = 1) +
  geom_quasirandom(aes(color = ifelse(abs(reporter_dif_minP) > 1, "Yes", "No"))) +
  scale_color_manual(values = c("Yes" = "grey20", "No" = "grey90")) +
  theme_pubr(x.text.angle = 90, legend = "none") +
  ggtitle("Figure S4: Reporter activity upon off-target pathway perturbations")
## Warning: Removed 85 rows containing missing values (`position_quasirandom()`).

## Figure S4: Plot perturbation response of NR1H4 reporters to all perturbations
ggplot(cDNA_df %>%
         mutate(tf = gsub("_.*", "", tf)) %>%
         filter(tf == "NR1H4::RXRA", stimulation != "no") %>%
         distinct(tf, reporter_id, reporter_dif_minP, commercial_reporter, condition, effect_size) %>%
         drop_na(effect_size) %>%
         mutate(reporter_dif_minP = ifelse(effect_size == 0, -reporter_dif_minP, reporter_dif_minP)) %>%
         mutate(dif_mean = ave(reporter_dif_minP, tf, condition, FUN = function(x) mean(x, na.rm = T))),
       aes(x = reorder(condition, -dif_mean), y = reporter_dif_minP, shape = commercial_reporter)) +
  geom_hline(yintercept = 0, lty = 2) +
  geom_hline(yintercept = 1, lty = 1) +
  geom_hline(yintercept = -1, lty = 1) +
  geom_quasirandom(aes(color = ifelse(abs(reporter_dif_minP) > 1, "Yes", "No"))) +
  scale_color_manual(values = c("Yes" = "grey20", "No" = "grey90")) +
  theme_pubr(x.text.angle = 90, legend = "none") +
  ggtitle("Figure S4: Reporter activity upon NR1H4 perturbations")
## Warning: Removed 1855 rows containing missing values (`position_quasirandom()`).

## Showcase examples
### TF activation
#### VDR
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "VDR"), condition %in% c("U2OS", "U2OS_Calcitriol"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, condition, reporter_activity_minP, promoter) %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(U2OS),
           y = log2(U2OS_Calcitriol), 
           shape = commercial_reporter, 
           color = promoter)) +
  geom_hline(yintercept = 0) + 
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  geom_point(size = 2.5) +
  xlim(-0.75, 8) +
  ylim(-0.75, 8) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("mCMV" = "black", "hBGm" = "grey50", "minP" = "grey80", "Random" = "grey90")) +
  ggtitle("VDR activation")

#### FOXA1
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "FOXA1"), condition %in% c("mES_Dox", "mES_FOXA1-OE"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, spacing, condition, reporter_activity_minP) %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(mES_Dox),
           y = log2(`mES_FOXA1-OE`), 
           shape = commercial_reporter,
           color = spacing)) +
  geom_hline(yintercept = 0) + 
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  geom_point(size = 2.5) +
  xlim(-1, 3) +
  ylim(-1, 3) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("FOXA1 activation")
## Warning: Removed 2 rows containing missing values (`geom_point()`).

#### HSF1
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "HSF1"), condition %in% c("U2OS", "U2OS_Heat"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, promoter, condition, background, reporter_activity_minP) %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(U2OS),
           y = log2(U2OS_Heat), 
           shape = commercial_reporter,
           color = as.factor(promoter))) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  geom_point(size = 2.5) +
  xlim(-0.75, 8) +
  ylim(-0.75, 8) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("mCMV" = "black", "hBGm" = "grey50", "minP" = "grey80", "Random" = "grey90")) +
  ggtitle("HSF1 activation")

#### CREB1
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "CREB1"), condition %in% c("mES_2i_LIF", "mES_FK"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, spacing, condition, background, reporter_activity_minP) %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(mES_2i_LIF),
           y = log2(mES_FK), 
           shape = commercial_reporter,
           color = as.factor(spacing))) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  geom_point(size = 2.5) +
  xlim(-1.25, 3) +
  ylim(-1.25, 3) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("CREB1 activation")
## Warning: Removed 2 rows containing missing values (`geom_point()`).

#### NR1H4
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "NR1H4::RXRA"), condition %in% c("HepG2", "HepG2_CDCA"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, promoter, condition, background, reporter_activity_minP) %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(HepG2),
           y = log2(HepG2_CDCA), 
           shape = commercial_reporter,
           color = as.factor(promoter))) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  geom_point(size = 2.5) +
  xlim(-0.75, 5) +
  ylim(-0.75, 5) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("mCMV" = "black", "hBGm" = "grey50", "minP" = "grey80", "Random" = "grey90")) +
  ggtitle("NR1H4 activation")

#### NFE2L2
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "NFE2L2"), condition %in% c("mES_2i_LIF", "mES_HQ"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, spacing, condition, background, reporter_activity_minP) %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(mES_2i_LIF),
           y = log2(mES_HQ), 
           shape = commercial_reporter,
           color = as.factor(spacing))) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  geom_point(size = 2.5) +
  xlim(-0.75, 7) +
  ylim(-0.75, 7) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("NFE2L2 activation")

#### TP53
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "TP53"), gcf == "gcf6210", condition %in% c("A549_Nutlin"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, spacing, condition, background, reporter_activity_minP, reference_activity_gcf),
       aes(x = reference_activity_gcf,
           y = log2(reporter_activity_minP), 
           shape = commercial_reporter,
           color = as.factor(spacing))) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  geom_point(size = 2.5) +
  xlim(-0.75, 7) +
  ylim(-0.75, 7) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("TP53 activation")
## Warning: Removed 11 rows containing missing values (`geom_point()`).

### TF inactivation
#### STAT3
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "STAT3"), condition %in% c("mES_2i_LIF", "mES_2i"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, promoter, condition, spacing, reporter_activity_minP) %>%
         #mutate(reporter_activity_minP = ave(reporter_activity_minP, reporter_id, condition, FUN = mean)) %>%
         #distinct() %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(mES_2i_LIF),
           y = log2(mES_2i), 
           shape = commercial_reporter,
           color = as.factor(spacing))) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  geom_point(size = 2.5) +
  xlim(-0.75, 5) +
  ylim(-0.75, 5) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("STAT3 inactivation")

#### MTF1
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "MTF1"), condition %in% c("HepG2_NT", "HepG2_MTF1"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, spacing, condition, reporter_activity_minP) %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(HepG2_NT),
           y = log2(HepG2_MTF1), 
           shape = commercial_reporter,
           color = spacing)) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  geom_point(size = 2.5) +
  xlim(-1, 5) +
  ylim(-1, 5) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none")+
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("MTF1 inactivation")
## Warning: Removed 3 rows containing missing values (`geom_point()`).

#### XBP1
ggplot(cDNA_df %>%
         filter(str_detect(tf, "XBP1"), condition %in% c("HepG2_XBP1", "HepG2_NT"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, spacing, condition, background, reporter_activity_minP) %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(HepG2_NT),
           y = log2(HepG2_XBP1),
           shape = commercial_reporter, 
           color = spacing)) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  xlim(-.5,6) +
  ylim(-.5,6) +
  geom_point(size = 2.5) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none")+
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("XBP1 inactivation")
## Warning: Removed 1 rows containing missing values (`geom_point()`).

#### PAX6
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "PAX6"), condition %in% c("HepG2_PAX6", "HepG2_NT"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, spacing, condition, background, reporter_activity_minP) %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(HepG2_NT),
           y = log2(HepG2_PAX6), 
           shape = commercial_reporter,
           color = spacing)) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  xlim(-.5,5) +
  ylim(-.5,5) +
  geom_point(size = 2.5) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("PAX6 inactivation")
## Warning: Removed 1 rows containing missing values (`geom_point()`).

### POU5F1::SOX2
ggplot(cDNA_df %>% 
  filter(str_detect(tf, "POU5F1::SOX2"), condition %in% c("mES_Sox2_AID", "mES_Pou5f1_AID", "mES_Sox2_ctrl", "mES_Pou5f1_ctrl"), 
                neg_ctrls == "No") %>%
  distinct(reporter_id, commercial_reporter, spacing, condition, background, reporter_activity_minP) %>%
  spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(mES_Sox2_ctrl),
           y = log2(mES_Sox2_AID), 
           shape = commercial_reporter,
           color = spacing)) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  xlim(-.5,5) +
  ylim(-.5,5) +
  geom_point(size = 2.5) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("POU5F1::SOX2 inactivation")

ggplot(cDNA_df %>% 
  filter(str_detect(tf, "POU5F1::SOX2"), condition %in% c("mES_Sox2_AID", "mES_Pou5f1_AID", "mES_Sox2_ctrl", "mES_Pou5f1_ctrl"), 
                neg_ctrls == "No") %>%
  distinct(reporter_id, commercial_reporter, spacing, condition, background, reporter_activity_minP) %>%
  spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(mES_Pou5f1_ctrl),
           y = log2(mES_Pou5f1_AID), 
           shape = commercial_reporter,
           color = spacing)) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  xlim(-.5,5.5) +
  ylim(-.5,5.5) +
  geom_point(size = 2.5) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("POU5F1::SOX2 inactivation")

### Off-target response
#### GRHL1
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "GRHL1"), condition %in% c("mES_NT", "mES_TFCP2L1"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, spacing, condition, reporter_activity_minP) %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(mES_NT),
           y = log2(mES_TFCP2L1), 
           shape = commercial_reporter,
           color = spacing)) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  xlim(-.5,5) +
  ylim(-.5,5) +
  geom_point(size = 2.5) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("GRHL1 inactivation upon TFCP2L1-KD")

#### TFCP2L1
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "TFCP2L1"), gcf == "gcf6952", condition %in% c("HepG2_NT", "HepG2_UBP1"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, spacing, condition, reporter_activity_minP_gcf) %>%
         spread(key = "condition", value = "reporter_activity_minP_gcf"),
       aes(x = log2(HepG2_NT),
           y = log2(HepG2_UBP1), 
           shape = commercial_reporter,
           color = spacing)) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_abline(lty = 2) +
  xlim(-.5,5.5) +
  ylim(-.5,5.5) +
  geom_point(size = 2.5) +
  xlab("Activity (log2)") +
  ylab(paste("Perturbation activity (log2)")) +
  theme_pubr(legend = "none") +
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("TFCP2L1 inactivation upon UBP1-KD")

#### GRHL1 vs TFCP2L1
grhl1_tfcp2l1 <- cDNA_df %>% 
  filter(tf %in% c("TFCP2L1", "GRHL1"), tf_target == 1 | tf_target == 2, cell == "mES", condition != "mES_TFCP2") %>%
  distinct(tf, reporter_id, reporter_dif_minP, spacing, promoter, background, condition) %>%
  spread(key = "condition", value = "reporter_dif_minP")

ggplot(grhl1_tfcp2l1,
       aes(x = mES_TFCP2L1, y = mES_GRHL1)) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  geom_point(aes(color = spacing, shape = as.factor(background)), size = 2.5) +
  theme_pubr() +
  theme(aspect.ratio = 1) +
  facet_wrap(~tf) +
  scale_color_manual(values = c("#7C9EB2", "#BFD0D9")) +
  ggtitle("GRHL1 vs TFCP2L1 inactivation")

#### TP53
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "TP53"), condition %in% c("HepG2", "HepG2_TP73"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, spacing, condition, promoter, reporter_activity_minP) %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(HepG2),
           y = log2(HepG2_TP73), 
           shape = commercial_reporter)) +
  geom_abline(lty = 2) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  xlim(-.75, 11) +
  ylim(-.75, 11) +
  geom_point(size = 2.5, aes(color = promoter)) +
  xlab("Activity (log2)") +
  ylab(paste("Off-target effect (log2)")) +
  theme_pubr(legend = "none") +
  theme(aspect.ratio = 1) + 
  scale_color_manual(values = c("mCMV" = "black", "hBGm" = "grey50", "minP" = "grey80", "Random" = "grey90")) +
  ggtitle("TP53 inactivation upon TP73-KD")

### CLOCK
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "CLOCK"), condition %in% c("mES_2i_LIF", "mES_2i"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, spacing, condition, promoter, reporter_activity_minP) %>%
         spread(key = "condition", value = "reporter_activity_minP"),
       aes(x = log2(mES_2i_LIF),
           y = log2(mES_2i), 
           shape = commercial_reporter)) +
  geom_abline(lty = 2) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  xlim(-.75, 3.5) +
  ylim(-.75, 3.5) +
  geom_point(size = 2.5, aes(color = promoter)) +
  xlab("Activity (log2)") +
  ylab(paste("Off-target effect (log2)")) +
  theme_pubr(legend = "none") +
  theme(aspect.ratio = 1) + 
  scale_color_manual(values = c("mCMV" = "black", "hBGm" = "grey50", "minP" = "grey80", "Random" = "grey90")) +
  ggtitle("CLOCK inactivation upon LIF removal")

### PPARG
ggplot(cDNA_df %>% 
         filter(str_detect(tf, "PPARG::RXRA"), gcf == "gcf7124", condition %in% c("MCF7", "MCF7_Hexestrol"), neg_ctrls == "No") %>%
         distinct(reporter_id, commercial_reporter, spacing, condition, promoter, reporter_activity_minP_gcf) %>%
         spread(key = "condition", value = "reporter_activity_minP_gcf"),
       aes(x = log2(MCF7),
           y = log2(MCF7_Hexestrol), 
           shape = commercial_reporter)) +
  geom_abline(lty = 2) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 0) +
  xlim(-1.5, 5.5) +
  ylim(-1.5, 5.5) +
  geom_point(size = 2.5) +
  xlab("Activity (log2)") +
  ylab(paste("Off-target effect (log2)")) +
  theme_pubr(legend = "none") +
  theme(aspect.ratio = 1) +
  ggtitle("PPARG::RXRA activation upon Hexestrol treatment")

Conclusion: Response often depends on reporter design. Synthetic reporters often outperform commercial ones.


Rate performance of synthetic vs. published reporters

Aim: Compare confidence levels of synthetic vs. published reporters.

## First I want to show how well commercial reporters perform compared to synthetic ones
bad_tfs <- cDNA_df_confidence2 %>%
  distinct(tf, conf_level) %>%
  filter(!is.na(conf_level)) %>%
  group_by(tf) %>%
  mutate(removed = conf_level > 0) %>%
  mutate(removed2 = TRUE %in% removed) %>%
  filter(removed2 == TRUE) %>%
  distinct(tf)

commercial_ranking <- cDNA_df_confidence2 %>%
  filter(tf %in% bad_tfs$tf) %>%
  group_by(tf) %>%
  arrange(conf_level, quality_score) %>%
  mutate(rank = row_number()) %>%
  distinct(tf, reporter_id, rank, commercial_reporter, conf_level) %>%
  ## remove tfs for which I don't have commercial reporters
  group_by(tf) %>%
  mutate(commercial = "Yes" %in% commercial_reporter) %>%
  ungroup() %>%
  filter(commercial == TRUE) %>%
  mutate(max_conf_level = ave(conf_level, tf, commercial_reporter, FUN = max))

# ## Figure S5C: Relative rank of published reporters
# ggplot(commercial_ranking %>%
#          filter(commercial_reporter == "Yes") %>%
#          mutate(max_rank2 = ave(rel_rank, tf, FUN = max)),
#        aes(x = reorder(tf, -max_rank2), y = rel_rank)) +
#   geom_point(aes(color = as.factor(conf_level))) +
#   theme_pubr(x.text.angle = 90) +
#   scale_color_manual(values = c("0" = "#CCCCCB", "1" = "#EFCABE", "2" = "#E7A08B", "3" = "#DD6840", "4" = "#BF4A27")) 

## Plot overview figure of commercial vs. synthetic reporters (only best reporter per TF)
commercial_ranking <- cDNA_df_confidence2 %>%
  group_by(tf) %>%
  arrange(conf_level, quality_score) %>%
  mutate(rank = row_number()) %>%
  distinct(tf, reporter_id, rank, commercial_reporter, conf_level) %>%
  ## remove tfs for which I don't have commercial reporters
  group_by(tf) %>%
  mutate(commercial = "Yes" %in% commercial_reporter) %>%
  ungroup() %>%
  #filter(commercial == TRUE) %>%
  ## calculate relative rank
  mutate(max_rank = ave(rank, tf, FUN = max)) %>%
  filter(max_rank > 10) %>%
  mutate(rel_rank = rank/max_rank)

commercial_ranking_best <- commercial_ranking %>%
  mutate(max_rank2 = ave(rank, tf, commercial_reporter, FUN = max)) %>%
  distinct(tf, max_rank2, conf_level, commercial_reporter, rank, commercial) %>%
  filter(rank == max_rank2)

conf_level_synth <- commercial_ranking_best %>%
  filter(commercial_reporter == "No") %>%
  mutate(max_conf_level = ave(conf_level, tf, FUN = max)) %>%
  distinct(tf, max_conf_level)

commercial_ranking_best <- commercial_ranking_best %>%
  left_join(conf_level_synth)

better_synth <- commercial_ranking_best %>%
  filter(commercial == T) %>%
  distinct(max_rank2, tf, commercial_reporter) %>%
  pivot_wider(values_from = "max_rank2", names_from = "commercial_reporter") %>%
  mutate(better_synth = ifelse(No > Yes, "0_Yes", "1_No")) %>%
  distinct(tf, better_synth)

commercial_ranking_best <- commercial_ranking_best %>%
  left_join(better_synth)

commercial_ranking_best_comm <- commercial_ranking_best %>%
  filter(commercial == T) %>%
  mutate(conf_level_sum = ave(conf_level, tf, FUN = sum)) %>%
  arrange(max_conf_level, better_synth, conf_level_sum) %>%
  mutate(tf = factor(tf, levels = unique(tf))) %>%
  mutate(commercial_reporter = factor(commercial_reporter, levels = c("No", "Yes")))

### Figure 6C: TFs where comparison synthetic vs. published is possible
ggplot(commercial_ranking_best_comm,
       aes(y = tf, x = commercial_reporter, fill = as.factor(conf_level))) +
  geom_tile() +
  coord_fixed() +
  theme_pubr(x.text.angle = 90) +
  scale_fill_manual(values = c("0" = "#CCCCCB", "1" = "#EFCABE", "2" = "#E7A08B", "3" = "#DD6840", "4" = "#BF4A27")) +
  scale_size_manual(values = c("Yes" = .35, "No" = 0)) +
  ggtitle("Figure 6C: Commercial vs. Synthetic reporters")

### Figure 6D: TFs with only synthetic
ggplot(commercial_ranking_best %>%
         filter(commercial == F),
       aes(y = reorder(tf, conf_level), x = "x", fill = as.factor(conf_level))) +
  geom_tile() +
  coord_fixed() +
  theme_pubr(x.text.angle = 90) +
  scale_fill_manual(values = c("0" = "#CCCCCB", "1" = "#EFCABE", "2" = "#E7A08B", "3" = "#DD6840", "4" = "#BF4A27")) +
  ggtitle("Figure 6D: TFs with only synthetic reporters")

### TFs with only published
commercial_ranking2 <- cDNA_df_confidence2 %>%
  group_by(tf) %>%
  arrange(conf_level, quality_score) %>%
  mutate(rank = row_number()) %>%
  distinct(tf, reporter_id, rank, commercial_reporter, conf_level) %>%
  ## remove tfs for which I don't have commercial reporters
  group_by(tf) %>%
  mutate(commercial = "Yes" %in% commercial_reporter) %>%
  ungroup() %>%
  #filter(commercial == TRUE) %>%
  ## calculate relative rank
  mutate(max_rank = ave(rank, tf, FUN = max)) %>%
  filter(max_rank < 12) %>%
  mutate(rel_rank = rank/max_rank)

commercial_ranking_best_only_comm <- commercial_ranking2 %>%
  mutate(max_rank2 = ave(rank, tf, commercial_reporter, FUN = max)) %>%
  distinct(tf, max_rank2, conf_level, commercial_reporter, rank, commercial) %>%
  filter(rank == max_rank2)

### Figure 6E: TFs with only published reporters
ggplot(commercial_ranking_best_only_comm,
       aes(y = reorder(tf, conf_level), x = "x", fill = as.factor(conf_level))) +
  geom_tile() +
  coord_fixed() +
  theme_pubr(x.text.angle = 90) +
  scale_fill_manual(values = c("0" = "#CCCCCB", "1" = "#EFCABE", "2" = "#E7A08B", "3" = "#DD6840", "4" = "#BF4A27")) +
  ggtitle("Figure 6E: TFs with only published reporters")

# ### which thresholds do the primer reporters fullfill?
# binary_confidence <- cDNA_df_confidence2 %>%
#   mutate(activity_thresh = ifelse((commercial_reporter == "No" & pval_adj < 0.05 & reporter_activity_minP > 1) |
#                                (commercial_reporter == "No" & n_active > 12 & reporter_activity_minP > 1) |
#                                (commercial_reporter == "Yes" & reporter_activity_minP > 1), 1, 0)) %>%
#   mutate(cor_thresh = ifelse(cor_pval > 1 & !is.na(cor_pval), conf_level + 1, conf_level)) %>%
#   mutate(dif_thresh = ifelse(reporter_dif_minP > 1 & !is.na(reporter_dif_minP), conf_level + 2, conf_level)) %>% 
#   mutate(off_thresh = ifelse(conf_level >= 1 & off_target_activity < -1 & !is.na(off_target_activity), conf_level - 1, conf_level)) %>%
#   distinct(tf, reporter_id, commercial_reporter, conf_level, activity_thresh, cor_thresh, dif_thresh, off_thresh) %>%
#   group_by(tf, commercial_reporter) %>%
#   arrange(desc(conf_level), desc(quality_score)) %>%
#   mutate(rank = row_number()) %>%
#   filter(rank == 1) %>%
#   ungroup() %>%
#   group_by(tf) %>%
#   mutate(max_conf_level = max(conf_level)) %>%
#   filter(max_conf_level > 1)

Conclusion: Many synthetic reporters outperform published reporters.


Extract prime reporter for each TF

## Table S4: Select prime reporter for each TF
best_reporters <- cDNA_df_confidence2 %>%
  group_by(tf) %>%
  arrange(desc(conf_level), desc(quality_score)) %>%
  mutate(rank = row_number()) %>%
  filter(rank == 1) %>%
  ungroup() %>%
  left_join(reporter_features) %>%
  dplyr::select(tf, reporter_id, promoter, spacing, distance, background, commercial_reporter,
           conf_level, quality_score, reporter_activity_minP, condition, cor_pval, reporter_dif_minP,
           perturbation_condition, off_target_activity, off_target_perturbation) %>%
  distinct() %>%
  mutate(condition = gsub("_.*", "", condition))
## Joining, by = c("reporter_id", "tf")
write_csv(best_reporters, "data/best_reporters.csv")

Show response of best reporters to mutations in their TFBS

Aim: Another proof: show that the prime reporters are responding well to TFBS mutations.

## Extract all reporter_acitvity_neg
neg_activities <- cDNA_df %>%
  dplyr::select(reporter_id, "condition" = cell, reporter_activity_neg) %>%
  mutate(condition = gsub("_.*", "", condition)) %>%
  mutate(reporter_activity_neg = ave(reporter_activity_neg, condition, reporter_id, FUN = function(x) mean(x, na.rm = T))) %>%
  distinct()

## Select best reporter for each TF (keep synthetic if published is better because I only have mutated controls for those)
best_reporters_sel <- cDNA_df_confidence2 %>%
  group_by(tf) %>%
  arrange(desc(conf_level), desc(quality_score)) %>%
  mutate(rank = row_number()) %>%
  filter(rank == 1) %>%
  filter(commercial_reporter == "No") %>%
  ungroup() %>%
  left_join(reporter_features) %>%
  dplyr::select(tf, reporter_id, promoter, spacing, distance, background, commercial_reporter,
           conf_level, quality_score, reporter_activity_minP, condition, cor_pval, reporter_dif_minP,
           perturbation_condition, off_target_activity, off_target_perturbation) %>%
  distinct() %>%
  mutate(condition = gsub("_.*", "", condition))
## Joining, by = c("reporter_id", "tf")
best_reporters_neg <- best_reporters_sel %>%
  left_join(neg_activities) %>%
  mutate(reporter_activity_neg = log2(reporter_activity_neg)) %>%
  #filter(conf_level > 1) %>%
  filter(!tf %in% c("GRHL1", "GATA4"))
## Joining, by = c("reporter_id", "condition")
## Plot response per TF
ggplot(best_reporters_neg %>%
         filter(conf_level > 1),
       aes(x = reorder(tf, -reporter_activity_neg), y = -reporter_activity_neg)) +
  geom_hline(yintercept = 0) +
  geom_bar(stat = "identity", fill = "#2278B5", color = "black") +
  theme_pubr(x.text.angle = 90, legend = "none")

ggplot(best_reporters_neg %>%
         filter(conf_level > 1),
       aes(x = reorder(tf, -reporter_activity_neg), y = -reporter_activity_neg)) +
  geom_hline(yintercept = 0) +
  geom_bar(stat = "identity", fill = "#2278B5", color = "black") +
  theme_pubr(x.text.angle = 90, legend = "none")

## Figure 6F: Plot original and mutated reporter activity
reporter_activity <- best_reporters_neg %>%
  dplyr::select(reporter_id, "start_activity" = reporter_activity_minP) %>%
  distinct()

ggplot(best_reporters_neg %>%
         mutate(reporter_activity_neg = reporter_activity_minP - reporter_activity_neg) %>%
         filter(conf_level > 1) %>%
         left_join(reporter_activity) %>%
         distinct(tf, reporter_activity_minP, reporter_activity_neg, start_activity) %>%
         pivot_longer(c(-tf, -start_activity), names_to = "activity_kind", values_to = "activity"),
       aes(x = reorder(tf, -start_activity), y = activity)) +
  geom_hline(yintercept = 0) +
  geom_point(aes(color = activity_kind)) +
  geom_line() +
  scale_color_manual(values = c("reporter_activity_minP" = "#2278B5", "reporter_activity_neg" = "grey")) +
  theme_pubr(x.text.angle = 90, legend = "none") +
  ggtitle("Figure 6F: Reporter activity of original and mutated TFBS")
## Joining, by = "reporter_id"

## Compare primer reporters vs. no primer reporters
best_reporters_all <- cDNA_df_confidence2 %>%
  group_by(tf) %>%
  arrange(desc(conf_level), desc(quality_score)) %>%
  mutate(rank = row_number()) %>%
  filter(commercial_reporter == "No") %>%
  ungroup() %>%
  distinct(tf, reporter_id, conf_level, rank, condition) %>%
  mutate(condition = gsub("_.*", "", condition)) %>%
  left_join(neg_activities) %>%
  mutate(reporter_activity_neg = log2(reporter_activity_neg)) %>%
  group_by(tf) %>%
  mutate(max_rank = max(rank)) %>%
  ungroup() %>%
  filter(rank == max_rank) %>%
  dplyr::select(reporter_id, reporter_activity_neg) %>%
  mutate(rank = "no_prime")
## Joining, by = c("reporter_id", "condition")
best_reporters_all <- cDNA_df_confidence2 %>%
  group_by(tf) %>%
  arrange(desc(conf_level), desc(quality_score)) %>%
  mutate(rank = row_number()) %>%
  filter(commercial_reporter == "No") %>%
  ungroup() %>%
  distinct(tf, reporter_id, conf_level, rank, condition) %>%
  mutate(condition = gsub("_.*", "", condition)) %>%
  left_join(neg_activities) %>%
  mutate(reporter_activity_neg = log2(reporter_activity_neg)) %>%
  group_by(tf) %>%
  mutate(max_conf_level = max(conf_level) - 1) %>%
  ungroup() %>%
  filter(conf_level == max_conf_level) %>%
  mutate(reporter_activity_neg = ave(reporter_activity_neg, tf, FUN = mean)) %>%
  dplyr::select("reporter_id" = tf, reporter_activity_neg) %>%
  distinct() %>%
  mutate(rank = "no_prime")
## Joining, by = c("reporter_id", "condition")
best_reporters_all_2 <- cDNA_df_confidence2 %>%
  group_by(tf) %>%
  arrange(desc(conf_level), desc(quality_score)) %>%
  mutate(rank = row_number()) %>%
  filter(commercial_reporter == "No") %>%
  ungroup() %>%
  distinct(tf, reporter_id, conf_level, rank, condition) %>%
  mutate(condition = gsub("_.*", "", condition)) %>%
  left_join(neg_activities) %>%
  mutate(reporter_activity_neg = log2(reporter_activity_neg)) %>%
  filter(conf_level > 1, rank == 1) %>%
  filter(!tf %in% c("GRHL1", "GATA4")) %>%
  dplyr::select(reporter_id, reporter_activity_neg) %>%
  mutate(rank = "prime")
## Joining, by = c("reporter_id", "condition")
best_reporters_all_3 <- best_reporters_all_2 %>%
  rbind(best_reporters_all)

## Figure S7
ggplot(best_reporters_all_3,
       aes(x = rank, y = reporter_activity_neg)) +
  geom_hline(yintercept = 0, lty = 2) +
  geom_hline(yintercept = 1) +
  geom_boxplot(outlier.shape = NA, varwidth = .5) +
  geom_quasirandom() +
  theme_pubr(legend = "none") +
  ggtitle("Figure S7: Mutation response of prime reporters vs. reporters with lower confidence level")
## Warning: Removed 2 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 2 rows containing missing values (`position_quasirandom()`).

Conclusion: Prime reporters have increased mutation sensitivity.


Explore activities of prime reporters in all cell types

Aim: Demonstrate the added value of using prime reporters.

## Select set of best reporters
optimal_reporters <- cDNA_df_confidence2 %>%
  group_by(tf) %>%
  arrange(desc(conf_level), desc(quality_score)) %>%
  mutate(rank = row_number()) %>%
  filter(rank == 1 & conf_level >= 2)

### Activity across the 9 cell types
heatmap_cell_activities <- cDNA_df %>%
  filter(reporter_id %in% optimal_reporters$reporter_id) %>%
  dplyr::filter(neg_ctrls == "No", 
                stimulation == "no",
                hPGK == "No", 
                str_detect(tf, "RANDOM", negate = T), 
                native_enhancer == "No") %>%
  distinct(tf, reporter_activity_minP, cell) %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  mutate(reporter_activity_minP = ave(reporter_activity_minP, tf, cell, FUN = mean)) %>%
  mutate(max_activity = ave(reporter_activity_minP, tf, FUN = function(x) max(x, na.rm = T))) %>%
  mutate(min_activity = ave(reporter_activity_minP, tf, FUN = function(x) min(x, na.rm = T))) %>% 
  #mutate(reporter_activity_minP_min_max = (reporter_activity_minP -  min_activity) / (max_activity - min_activity)) %>%
  mutate(reporter_activity_minP = reporter_activity_minP / max_activity) %>%
  dplyr::select(-max_activity, -min_activity) %>%
  distinct() %>%
  spread(key = "cell", value = "reporter_activity_minP") %>% 
  replace(is.na(.), 0) %>%
  column_to_rownames("tf")

myBreaks <- c(seq(-.01, 0, length.out=50+1), 
              seq(0.000001, 1, length.out=50))

hc <- hclust(dist(heatmap_cell_activities), method = "ward.D2")
order_dendogram <- data.frame("order" = 1:nrow(heatmap_cell_activities),
                              "tf" = "")
cluster_tfs <- data.frame("cluster" = cutree(hc, k = 6)) %>%
  rownames_to_column("tf")

for(i in unique(order_dendogram$order)) {
  order_dendogram$tf[order_dendogram$order == i] <- rownames(heatmap_cell_activities[hc$order[i],])
}

cluster_tfs2 <- cluster_tfs %>%
  column_to_rownames("tf")

## Figure 7A: Activities of prime reporters per cell type
p <- pheatmap(heatmap_cell_activities %>% 
             as.matrix(),
         color = colorRampPalette(c("#F2CC8F", "white", "#3D405B"))(100),
         clustering_method = "ward.D2",
         border_color = F, 
         cellwidth = 8, 
         cellheight = 8,
         angle_col = 90,
         cutree_rows = 6,
         annotation_row = cluster_tfs2,
         breaks = myBreaks,
         main = "Figure 7A: Activities of prime reporters per cell type")

## Include max activities per TF
heatmap_cell_activities_max <- cDNA_df %>%
  filter(reporter_id %in% optimal_reporters$reporter_id) %>%
  dplyr::filter(neg_ctrls == "No", 
                stimulation == "no",
                hPGK == "No", 
                str_detect(tf, "RANDOM", negate = T), 
                native_enhancer == "No") %>%
  distinct(tf, reporter_activity_minP, cell) %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  mutate(reporter_activity_minP = ave(reporter_activity_minP, tf, cell, FUN = mean)) %>%
  mutate(max_activity = log2(ave(reporter_activity_minP, tf, FUN = function(x) max(x, na.rm = T)))) %>%
  distinct(tf, max_activity) %>%
  replace(is.na(.), 0) %>%
  column_to_rownames("tf") %>%
  as.matrix() %>%
  t() %>%
  .[,rownames(heatmap_cell_activities)]

# Extract hierarchical clustering from plot_scores3
hc <- p[["tree_row"]]
  
#hc$order <- 1:nrow(heatmap_cell_activities)

myBreaks <- c(seq(-.01, 0, length.out=50+1), 
              seq(0.000001, 8, length.out=50))

pheatmap(heatmap_cell_activities_max %>% 
           as.matrix(),
         color = colorRampPalette(c("#F2CC8F", "white", "#A5A58D"))(100),
         cluster_rows = hc, 
         cluster_cols = F,
         border_color = F, 
         cellwidth = 8, 
         cellheight = 8,
         cutree_rows = 6,
         angle_col = 90,
         breaks = myBreaks)

### UMAP analysis
set.seed(76) 
heatmap_cell_activities_cor <- cor(heatmap_cell_activities %>% t())
tf_umap <- umap(heatmap_cell_activities_cor) 
## as(<dgTMatrix>, "dgCMatrix") is deprecated since Matrix 1.5-0; do as(., "CsparseMatrix") instead
colnames(tf_umap$layout) <- c("A","B") 
tf_umap_plot <- tf_umap$layout %>% as.data.frame() %>% rownames_to_column(var = "tf") %>% left_join(cluster_tfs) #Import data from previous line
## Joining, by = "tf"
# Figure S8B: UMAP visualization of Figure 7A (export 7x7)
ggplot(tf_umap_plot,
       aes(A,B)) +
  #stat_density2d(geom = "polygon", aes(alpha = ..level..), bins = 8) +
  theme_pubr() + 
  geom_point(size = 4, aes(color = as.factor(cluster))) + 
  theme(panel.grid = element_blank()) + 
  scale_alpha(range = c(0.05,0.15)) +
  geom_text_repel(aes(label = tf), force = 60) + 
  ylab("UMAP2") + 
  xlab("UMAP1") +
  ggtitle("Figure S8B: UMAP visualization of Figure 7A")
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Same with RNA-seq
### UMAP analysis
tf_rna_sel <- tf_rna %>%
  filter(tf %in% rownames(heatmap_cell_activities)) %>%
  mutate(max_nTPM = ave(nTPM, tf, FUN = function(x) max(x, na.rm = T))) %>%
  mutate(min_nTPM = ave(nTPM, tf, FUN = function(x) min(x, na.rm = T))) %>% 
  mutate(nTPM = (nTPM -  min_nTPM) / (max_nTPM - min_nTPM)) %>%
  distinct(tf, cell, nTPM) %>%
  spread(key = "cell", value = "nTPM") %>%
  column_to_rownames("tf")
  
tf_rna_sel_cor <- cor(tf_rna_sel %>% t())
tf_umap_rna <- umap(tf_rna_sel_cor) 
colnames(tf_umap_rna$layout) <- c("A","B") 
tf_umap_rna_plot <- tf_umap_rna$layout %>% as.data.frame() %>% rownames_to_column(var = "tf") %>% left_join(cluster_tfs) #Import data from previous line
## Joining, by = "tf"
# UMAP based on RNA-seq (export 7x7)
ggplot(tf_umap_rna_plot,
       aes(A,B)) +
  #stat_density2d(geom = "polygon", aes(alpha = ..level..), bins = 8) +
  theme_pubr() + 
  geom_point(size = 4, aes(color = as.factor(cluster))) + 
  theme(panel.grid = element_blank()) + 
  scale_alpha(range = c(0.05,0.15)) +
  geom_text_repel(aes(label = tf), force = 60) + 
  ylab("UMAP2") + 
  xlab("UMAP1") +
  ggtitle("UMAP based on RNA-seq alone")

Conclusion: Prime reporters reflect cell type-specificity of TFs.


Same heatmap with only published reporters

## Select set of best reporters
optimal_reporters <- cDNA_df_confidence2 %>%
  filter(commercial_reporter == "Yes") %>%
  group_by(tf) %>%
  arrange(desc(conf_level), desc(quality_score)) %>%
  mutate(rank = row_number()) %>%
  filter(rank == 1)

### Activity across the 9 cell types
heatmap_cell_activities <- cDNA_df %>%
  filter(reporter_id %in% optimal_reporters$reporter_id) %>%
  dplyr::filter(neg_ctrls == "No", 
                stimulation == "no",
                hPGK == "No", 
                str_detect(tf, "RANDOM", negate = T), 
                native_enhancer == "No") %>%
  distinct(tf, reporter_activity_minP, cell) %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  mutate(reporter_activity_minP = ave(reporter_activity_minP, tf, cell, FUN = mean)) %>%
  mutate(max_activity = ave(reporter_activity_minP, tf, FUN = function(x) max(x, na.rm = T))) %>%
  mutate(min_activity = ave(reporter_activity_minP, tf, FUN = function(x) min(x, na.rm = T))) %>% 
  mutate(reporter_activity_minP = (reporter_activity_minP -  min_activity) / (max_activity - min_activity)) %>%
  dplyr::select(-max_activity, -min_activity) %>%
  distinct() %>%
  spread(key = "cell", value = "reporter_activity_minP") %>% 
  replace(is.na(.), 0) %>%
  column_to_rownames("tf")

myBreaks <- c(seq(-.01, 0, length.out=50+1), 
              seq(0.000001, 1, length.out=50))

hc <- hclust(dist(heatmap_cell_activities), method = "ward.D2")
order_dendogram <- data.frame("order" = 1:nrow(heatmap_cell_activities),
                              "tf" = "")
cluster_tfs <- data.frame("cluster" = cutree(hc, k = 6)) %>%
  rownames_to_column("tf")

for(i in unique(order_dendogram$order)) {
  order_dendogram$tf[order_dendogram$order == i] <- rownames(heatmap_cell_activities[hc$order[i],])
}

cluster_tfs2 <- cluster_tfs %>%
  column_to_rownames("tf")

## Figure 6A: Activities of top 3 reporters per cell type
p <- pheatmap(heatmap_cell_activities %>% 
           as.matrix(),
         color = colorRampPalette(c("#F2CC8F", "white", "#3D405B"))(100),
         clustering_method = "ward.D2",
         border_color = F, 
         cellwidth = 8, 
         cellheight = 8,
         angle_col = 90,
         cutree_rows = 6,
         annotation_row = cluster_tfs2,
         breaks = myBreaks)

## Compare published to synthetic
cell_activities_comp <- cDNA_df %>%
  filter(reporter_id %in% c("HNF4A_TF-seq_NA_21bp_mCMV_NA", "SOX2_romanov_NA_10bp_mCMV_NA",
                            "HNF4A_10bp_10bp_mCMV_3", "SOX2_5bp_21bp_mCMV_1")) %>%
  dplyr::filter(neg_ctrls == "No", 
                stimulation == "no",
                hPGK == "No", 
                str_detect(tf, "RANDOM", negate = T), 
                native_enhancer == "No") %>%
  distinct(tf, reporter_activity_minP, cell, commercial_reporter) %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  mutate(reporter_activity_minP = ave(reporter_activity_minP, tf, commercial_reporter, cell, FUN = mean)) %>%
  mutate(max_activity = ave(reporter_activity_minP, tf, commercial_reporter, FUN = function(x) max(x, na.rm = T))) %>%
  mutate(min_activity = ave(reporter_activity_minP, tf, commercial_reporter, FUN = function(x) min(x, na.rm = T))) %>% 
  mutate(reporter_activity_minP = (reporter_activity_minP) / (max_activity)) %>%
  dplyr::select(-max_activity, -min_activity) %>%
  distinct() 

cell_activities_comp2 <- cell_activities_comp %>%
  filter(commercial_reporter == "No") %>%
  dplyr::select(tf, cell, "synth_activity" = reporter_activity_minP)

cell_activities_comp <- cell_activities_comp %>%
  left_join(cell_activities_comp2)
## Joining, by = c("cell", "tf")
## Figure S8C
ggplot(cell_activities_comp %>%
         filter(tf == "HNF4A"),
       aes(x = reorder(cell, -synth_activity), y = reporter_activity_minP, color = commercial_reporter, group = commercial_reporter)) + 
  geom_line() +
  geom_point() +
  theme_pubr(x.text.angle = 90) +
  ggtitle("Figure S8C: Cell type-specificity of HNF4A reporters: prime vs. published")


Explore activities of prime reporters upon KDs in HepG2

### Activities upon KD
optimal_reporters <- cDNA_df_confidence2 %>%
  group_by(tf) %>%
  arrange(desc(conf_level), desc(quality_score)) %>%
  mutate(rank = row_number()) %>%
  filter(rank == 1 & conf_level >= 2)

cDNA_df2 <- cDNA_df %>%
  filter(reporter_id %in% optimal_reporters$reporter_id) %>%
  dplyr::filter(neg_ctrls == "No", 
                #commercial_reporter == "No", 
                hPGK == "No", 
                str_detect(tf, "RANDOM", negate = T), 
                native_enhancer == "No") %>%
  distinct(tf, condition, reporter_dif_minP, stimulation, cell) %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  distinct()

cDNA_df3 <- cDNA_df2 %>%
  filter(cell %in% c("HepG2")) %>%
  filter(stimulation == "KD", !tf %in% c("FOXO1", "ATF2")) %>%
  mutate(reporter_dif_minP = ave(reporter_dif_minP, tf, condition, FUN = mean)) #%>%
  #mutate(reporter_dif_minP = ifelse(abs(reporter_dif_minP) > 0.2, reporter_dif_minP, 0))

cDNA_df3[is.na(cDNA_df3)] <- 0

cDNA_df4 <- cDNA_df3 %>%
  distinct(tf, condition, reporter_dif_minP, cell) %>%
  mutate(condition = gsub("HepG2_", "", condition)) %>%
  filter(condition %in% tf) %>%
  filter(tf %in% condition) %>%
  mutate(diag = ifelse(tf == condition, T, F)) 

cDNA_df5 <- cDNA_df4 %>%
  filter(!condition %in% c("TP53", "REL", "RARA", "GRHL1", "FOS::JUN"), 
         !tf %in% c("TP53", "REL", "RARA", "GRHL1", "FOS::JUN")) %>%
  arrange(diag)

cDNA_df5_eff_conditions <- cDNA_df5 %>%
  filter(diag == T & reporter_dif_minP < -.25)
 
ggplot(cDNA_df5,
       aes(x = tf, y = condition, fill = reporter_dif_minP)) +
  geom_tile(color = cDNA_df5$diag, size = .5) +
  scale_color_manual(values = c("black", "white")) +
  scale_fill_gradient2(low = "#99B2DD",
                        mid = "white",
                        high = "#dd6b48",
                       limits = c(-2,2),
                       oob = squish) +
  coord_fixed() +
  xlab("TF reporter") +
  ylab("KD condition") +
  theme_pubr(border = T, x.text.angle = 90) 

overview_effects <- cDNA_df3 %>%
  distinct(tf, condition, reporter_dif_minP, cell) %>%
  mutate(max_reporter_dif = ave(reporter_dif_minP, condition, FUN = function(x) max(abs(x), na.rm = T))) %>%
  mutate(max_reporter_dif_tf = ave(reporter_dif_minP, tf, FUN = function(x) max(abs(x), na.rm = T))) %>%
  left_join(tf_rna) %>%
  filter(nTPM > 2, max_reporter_dif >= 1) %>%
  mutate(condition = gsub("HepG2_", "", condition)) %>%
  filter(condition %in% cDNA_df5_eff_conditions$condition) %>%
  filter(max_reporter_dif_tf >= 1 | (tf %in% condition)) %>%
  distinct(tf, condition, reporter_dif_minP) %>%
  filter(tf != "STAT1")
## Joining, by = c("cell", "tf")
overview_effects2 <- overview_effects %>%
  pivot_wider(names_from = "condition", values_from = "reporter_dif_minP") %>%
  column_to_rownames("tf") %>%
  replace(is.na(.), 0)

myBreaks <- c(seq(-1.5, 0, length.out=50+1), 
              seq(0.000001, 1.5, length.out=50))


## Figure S8D
pheatmap(overview_effects2,
         color = colorRampPalette(c("#99B2DD", "white", "#E07A5F"))(100),
         clustering_method = "ward.D2",
         border_color = F, 
         cellwidth = 10, 
         cellheight = 10,
         angle_col = 90,
         #cutree_cols = 4,
         #cutree_rows = 8,
         breaks = myBreaks)

### Interaction analysis
threshold <- .75

interesting_tfs <- c("ETS2", "HNF1A", "NRF1", "PAX6", "SP1", "SRF", "STAT3", "TCF7", "STAT1::STAT2", "THRB", 
                     "GATA4", "SMAD4", "HNF4A", "NR1H4::RXRA")

interaction_df <- overview_effects %>%
  filter(condition %in% interesting_tfs, tf %in% interesting_tfs) %>%
  dplyr::select("receiver" = tf, "signal" = condition, reporter_dif_minP) %>%
  distinct() %>%
  mutate(TF_dif_binary = "0") %>%
  mutate(TF_dif_binary = ifelse(reporter_dif_minP > threshold, "-1", TF_dif_binary)) %>%
  mutate(TF_dif_binary = ifelse(reporter_dif_minP < -threshold, "1", TF_dif_binary)) %>%
  mutate(TF_dif_binary = ifelse(reporter_dif_minP > -threshold & reporter_dif_minP < threshold, "0", TF_dif_binary)) %>%
  mutate(TF_dif_binary = as.numeric(TF_dif_binary)) %>%
  mutate(TF_dif_binary = (ifelse(receiver == signal, 0, TF_dif_binary))) %>%
  mutate(sum_receiver = ave(TF_dif_binary, receiver, FUN = function(x) sum(x))) %>%
  mutate(sum_signal = ave(TF_dif_binary, signal, FUN = function(x) sum(x))) %>%
  mutate(sum_receiver_size = ave(TF_dif_binary, receiver, FUN = function(x) sum(abs(x)))) %>%
  mutate(sum_signal_size = ave(TF_dif_binary, signal, FUN = function(x) sum(abs(x))))

interaction_matrix <- interaction_df %>%
  distinct(TF_dif_binary, receiver, signal) %>%
  spread(key = "receiver", value = "TF_dif_binary") %>%
  column_to_rownames("signal")

interaction_nodes1 <- interaction_df %>%
  filter(signal == receiver) %>%
  distinct(receiver, signal, sum_signal, sum_receiver, sum_signal_size, sum_receiver_size) %>%
  mutate(sum_total = sum_receiver_size + sum_signal_size) %>%
  mutate(class = ifelse(sum_signal <= 0, "repressor", "activator")) %>%
  distinct(receiver, sum_total, class)

### Add only-signal
interaction_nodes2 <- interaction_df %>%
  filter(!signal %in% unique(interaction_nodes1$receiver)) %>%
  dplyr::select("receiver" = signal, "sum_total" = sum_signal_size, sum_signal) %>%
  mutate(class = ifelse(sum_signal <= 0, "repressor", "activator")) %>%
  dplyr::select(-sum_signal) %>%
  distinct()

### Add only-receiver
interaction_nodes3 <- interaction_df %>%
  filter(!receiver %in% unique(interaction_nodes1$receiver)) %>%
  dplyr::select(receiver, "sum_total" = sum_receiver_size, sum_receiver) %>%
  mutate(class = ifelse(sum_receiver <= 0, "repressor", "activator")) %>%
  dplyr::select(-sum_receiver) %>%
  distinct()

interaction_nodes <- rbind(interaction_nodes1, interaction_nodes2, interaction_nodes3)

# interaction_edges <- interaction_df %>%
#   left_join(target_tf %>% dplyr::select("signal" = condition, target)) %>%
#   filter(signal != receiver) %>%
#   distinct(receiver, signal, TF_dif_binary, TF_dif) %>%
#   mutate(weight = 0) %>%
#   mutate(weight = ifelse(TF_dif_binary == -1, 1, weight)) %>%
#   mutate(weight = ifelse(TF_dif_binary == 1, 2, weight)) %>%
#   mutate(class = ifelse(weight == 1, "repressor", "activator")) %>%
#   dplyr::select(signal, receiver, weight, class, TF_dif) %>%
#   filter(weight != 0) %>%
#   mutate(TF_dif = abs(TF_dif))

interaction_edges <- interaction_df %>%
  filter(receiver != signal) %>%
  distinct(receiver, signal, TF_dif_binary, reporter_dif_minP) %>%
  mutate(weight = 0) %>%
  mutate(weight = ifelse(TF_dif_binary == -1, 1, weight)) %>%
  mutate(weight = ifelse(TF_dif_binary == 1, 2, weight)) %>%
  mutate(class = ifelse(weight == 1, "repressor", "activator")) %>%
  dplyr::select(signal, receiver, weight, class, reporter_dif_minP) %>%
  filter(weight != 0) %>%
  mutate(reporter_dif_minP = abs(reporter_dif_minP))

interaction_nodes <- interaction_nodes %>%
  filter(receiver %in% c(unique(interaction_edges$receiver),unique(interaction_edges$signal)))

routes_igraph <- graph_from_data_frame(d = interaction_edges, vertices = interaction_nodes, directed = TRUE)

visIgraph(routes_igraph) %>%
    visNodes(size = 25, shape = "circle") %>%
    visOptions(highlightNearest = list(enabled = TRUE, hover = TRUE),
               nodesIdSelection = TRUE) %>%
    visInteraction(keyboard = TRUE)
ggraph(routes_igraph, layout = "sugiyama") +
  geom_node_point(aes(size = sum_total, color = class)) +
  geom_edge_link(aes(color = class, width = reporter_dif_minP),
                 alpha = 0.8,
                 arrow = arrow(length = unit(3, 'mm')),
                 start_cap = circle(2, 'mm'),
                 end_cap = circle(3, 'mm')) +
  scale_edge_width(range = c(0.2, 2)) +
  scale_radius(limits = c(1, NA), range = c(1, 6)) +
  geom_node_text(aes(label = name), repel = TRUE) +
  scale_edge_color_manual(values = c("activator" = "#DD6B47", "repressor" = "#0077B6")) +
  scale_color_manual(values = c("activator" = "#DD6B47", "repressor" = "#0077B6")) +
  theme_graph()
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.

ggraph(routes_igraph, layout = "stress") +
  geom_node_point(aes(size = sum_total), color = "grey") +
  geom_edge_fan(aes(color = class), width = .25, start_cap = circle(2, 'mm'), end_cap = circle(3, 'mm')) +
  scale_radius(limits = c(1, NA), range = c(1, 6)) +
  geom_node_text(aes(label = name), repel = TRUE) +
  scale_edge_color_manual(values = c("activator" = colors_diverse[2], "repressor" = colors_diverse[1])) +
  scale_color_manual(values = c("activator" = colors_diverse[2], "repressor" = colors_diverse[1])) +
  theme_graph()

Conclusion: Many other TFs change their activity upon KD - this is a rich dataset to further explore and validate!


Connect changes in TF activities upon HEPG2 KD to changes in RNA levels

Aim: Show differences between RNA-seq and TF activity.

# Load RNA-seq data
KD_tf_changes <- read_csv("/DATA/usr/m.trauernicht/projects/SuRE-TF/data/gene_counts_tpm_KD.csv") %>%
  dplyr::select(condition, "tf" = external_gene_id, TPM_fc_mean) %>%
  distinct() %>%
  mutate(tf = ifelse(tf == "AHR", "AHR::ARNT", tf)) %>%
  mutate(tf = ifelse(tf == "NR1H4", "NR1H4::RXRA", tf)) %>%
  mutate(tf = ifelse(tf == "STAT1", "STAT1::STAT2", tf)) %>%
  mutate(tf = ifelse(tf == "FOS", "FOS::JUN", tf)) %>%
  filter(condition != "NT")
## New names:
## Rows: 195484 Columns: 8
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): external_gene_id, sample, condition, replicate dbl (4): ...1, TPM, TPM_fc,
## TPM_fc_mean
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
overview_effects_KD <- cDNA_df3 %>%
  distinct(condition, tf, reporter_dif_minP) %>%
  mutate(condition = gsub(".*_", "", condition)) %>%
  filter(condition %in% c("SP1", "NRF1", "HNF1A", "MTF1")) %>%
  left_join(KD_tf_changes) %>%
  left_join(tf_rna %>% filter(cell == "HepG2") %>% distinct(tf, nTPM)) %>%
  filter(nTPM > 1 | tf %in% c("SP1", "NRF1", "HNF1A", "MTF1")) %>%
  na.omit() %>%
  mutate(TPM_fc_mean = log2(TPM_fc_mean)) %>%
  mutate(target = ifelse(tf == condition, T, F))
## Joining, by = c("condition", "tf")
## Joining, by = "tf"
ggplot(overview_effects_KD, aes(y = TPM_fc_mean, x = reporter_dif_minP, color = target)) +
  geom_abline(lty = 2) +
  geom_hline(yintercept = 0, lty = 1) +
  geom_vline(xintercept = 0, lty = 1) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~condition, scales = "free") +
  scale_color_manual(values = c("TRUE" = "red", "FALSE" = "black")) +
  theme_pubr()
## `geom_smooth()` using formula = 'y ~ x'

Interesting observations, that would need further validations.


Explore perturbation response of prime reporters in mESCs

optimal_reporters <- cDNA_df_confidence2 %>%
  group_by(tf) %>%
  arrange(desc(conf_level), desc(quality_score)) %>%
  mutate(rank = row_number()) %>%
  filter(rank == 1 & conf_level >= 2 & tf != "RFX1")

cDNA_df2 <- cDNA_df %>%
  filter(reporter_id %in% optimal_reporters$reporter_id) %>%
  dplyr::filter(neg_ctrls == "No", 
                #commercial_reporter == "No", 
                hPGK == "No", 
                str_detect(tf, "RANDOM", negate = T), 
                native_enhancer == "No") %>%
  distinct(tf, condition, reporter_dif_minP, stimulation, cell) %>%
  mutate(tf = gsub("_.*", "", tf)) %>%
  mutate(reporter_dif_minP = ave(reporter_dif_minP, tf, condition, FUN = function(x) mean(x))) %>%
  distinct()

# All mES perturbations: KDs, overexpressions, stimulations, degron
cDNA_df2_mES <- cDNA_df2 %>%
  filter(cell == "mES", stimulation != "no")

cDNA_df2_mES2_filt <- cDNA_df2_mES %>%
  left_join(tf_rna) %>%
  filter((nTPM > 4 & !tf %in% c("TP53", "ATF4", "ATF2", "IRF3", "NFE2L2")
          )
         | tf %in% c("EGR1", "FOXA1") 
         ) %>%
  dplyr::select(-TPM, -TPM_norm, -nTPM, -cell, -stimulation)
## Joining, by = c("cell", "tf")
### Remove conditions that don't seem to induce any changes
cDNA_df2_mES2_filt2 <- cDNA_df2_mES2_filt %>%
  mutate(reporter_dif_max = ave(reporter_dif_minP, condition, FUN = function(x) max(x, na.rm = T))) %>%
  mutate(reporter_dif_min = ave(reporter_dif_minP, condition, FUN = function(x) min(x, na.rm = T))) %>%
  filter(reporter_dif_max > .75 | reporter_dif_min < -.75) %>%
  filter(!condition %in% c("mES_KLF9", "mES_GRHL1", "mES_HSF1", "mES_CREB1", "mES_RXRA", "mES_KLF4", "mES_SOX9",
                            "mES_TEAD2", "mES_POU2F1", "mES_MEF2B", "mES_TCF7", "mES_SOX2", "mES_RFX1-OE", "mES_GATA1-OE", 
                            "mES_OTX2-OE", "mES_EGR1", "mES_MEF2A",
                           "mES_NFIA", "mES_SOX13", "mES_E2F1", "mES_HSF2", "mES_TCF7L1", 
                           "mES_BX", "mES_MEF2A", "mES_PPARG::RXRA", "mES_HIC2", "mES_PPARA::RXRA", "mES_EGR1",
                           "mES_HQ", "mES_GLI1-OE", "mES_FOSL1-OE", "mES_ATF2", "mES_NR4A2-OE-cDIM12",
                           "mES_Nutlin", "mES_IWP2", "mES_TCF7L2", "mES_Dox", "mES_FOS::JUN",
                           "mES_POU5F1","mES_vitC", "mES_FK", "mES_NR1H2",
                           "mES_SP1", "mES_TEAD1", "mES_TFCP2L1", "mES_FOXA1-OE", "mES_NR4A2-OE", "mES_N2B27"))

cDNA_df2_mES2_heatmap <- cDNA_df2_mES2_filt %>%
  filter(condition %in% cDNA_df2_mES2_filt2$condition) %>%
  mutate(max_reporter_dif = ave(reporter_dif_minP, tf, FUN = function (x) max(abs(x), na.rm = T))) %>%
  filter(max_reporter_dif > 1) %>%
  dplyr::select(-max_reporter_dif) %>%
  distinct() %>%
  pivot_wider(names_from = "condition", values_from = "reporter_dif_minP") %>%
  replace(is.na(.), 0) %>%
  column_to_rownames("tf") 

myBreaks <- c(seq(-2, 0, length.out=50+1), 
              seq(0.000001, 2, length.out=50))


## Figure 7B
pheatmap(t(cDNA_df2_mES2_heatmap),
         color = colorRampPalette(c("#99B2DD", "white", "#E07A5F"))(100),
         clustering_method = "ward.D2",
         border_color = F, 
         cellwidth = 10, 
         cellheight = 10,
         #annotation_col = tf_clusters, 
         angle_col = 90,
         cutree_cols = 4,
         cutree_rows = 4,
         breaks = myBreaks,
         main = "Figure 7B: mES perturbations")

cDNA_df2_mES2_heatmap_foc <- cDNA_df2_mES2_filt %>%
  filter(condition %in% cDNA_df2_mES2_filt2$condition) %>%
  mutate(max_reporter_dif = ave(reporter_dif_minP, tf, FUN = function (x) max(abs(x), na.rm = T))) %>%
  filter(max_reporter_dif > 1) %>%
  dplyr::select(-max_reporter_dif) %>%
  distinct()

for (i in unique(cDNA_df2_mES2_heatmap_foc$condition)) {
  p <- ggplot(cDNA_df2_mES2_heatmap_foc %>%
                filter(condition == i),
       aes(x = reorder(tf, reporter_dif_minP), y = reporter_dif_minP)) +
  geom_bar(stat = "identity") +
  theme_pubr(x.text.angle = 90) +
    ggtitle(i)
  
  print(p)
}

Conclusion: Signaling pathways seem to be interdependent. SOX2 and POU5F1 affect a lot of other TF activities.


Visualize perturbation responses in an interaction network

threshold <- 1

## Generate a binary heatmap, all values below absolute threshold should be 0, all other should be set to 1 or -1
cDNA_df2_mES2_heatmap_bin <- cDNA_df2_mES2_heatmap %>%
  mutate_all(~ifelse(abs(.) < threshold, 0, .)) %>%
  mutate_all(~ifelse(. < -threshold, -1, .)) %>%
  mutate_all(~ifelse(. > threshold, 1, .))

pheatmap(cDNA_df2_mES2_heatmap_bin,
         color = colorRampPalette(c("#99B2DD", "white", "#E07A5F"))(100),
         clustering_method = "ward.D2",
         border_color = F, 
         cellwidth = 10, 
         cellheight = 10,
         #annotation_col = tf_clusters, 
         angle_col = 90,
         cutree_cols = 4,
         cutree_rows = 4,
         breaks = myBreaks)

target_tf <- read_csv("data/target_tf.csv")
## Rows: 10 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): condition, target
## dbl (1): effect_size
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cDNA_df4 <- cDNA_df2_mES2_filt %>%
  filter(condition %in% cDNA_df2_mES2_filt2$condition) %>%
  filter(tf %in% c(rownames(cDNA_df2_mES2_heatmap), "NANOG")) %>%
  filter(tf != "POU5F1::SOX2") %>%
  left_join(target_tf) %>%
  na.omit() %>%
  distinct(tf, condition, reporter_dif_minP, target, effect_size) %>%
  mutate(reporter_dif_minP = ifelse(effect_size == 1, -reporter_dif_minP, reporter_dif_minP))
## Joining, by = "condition"
  # filter(target %in% tf) %>%
  # filter(tf %in% target) %>%
  # mutate(diag = ifelse(tf == target, T, F)) %>%
  # arrange(diag)

interaction_df <- cDNA_df4 %>%
  dplyr::select("receiver" = tf, "signal" = target, reporter_dif_minP) %>%
  distinct() %>%
  mutate(TF_dif_binary = "0") %>%
  mutate(TF_dif_binary = ifelse(reporter_dif_minP > threshold, "-1", TF_dif_binary)) %>%
  mutate(TF_dif_binary = ifelse(reporter_dif_minP < -threshold, "1", TF_dif_binary)) %>%
  mutate(TF_dif_binary = ifelse(reporter_dif_minP > -threshold & reporter_dif_minP < threshold, "0", TF_dif_binary)) %>%
  mutate(TF_dif_binary = as.numeric(TF_dif_binary)) %>%
  mutate(TF_dif_binary = (ifelse(receiver == signal, 0, TF_dif_binary))) %>%
  mutate(sum_receiver = ave(TF_dif_binary, receiver, FUN = function(x) sum(x))) %>%
  mutate(sum_signal = ave(TF_dif_binary, signal, FUN = function(x) sum(x))) %>%
  mutate(sum_receiver_size = ave(TF_dif_binary, receiver, FUN = function(x) sum(abs(x)))) %>%
  mutate(sum_signal_size = ave(TF_dif_binary, signal, FUN = function(x) sum(abs(x))))


interaction_matrix <- interaction_df %>%
  distinct(TF_dif_binary, receiver, signal) %>%
  spread(key = "receiver", value = "TF_dif_binary") %>%
  column_to_rownames("signal")

pheatmap(interaction_matrix,
         border_color = NA,
         color = colorRampPalette(c("#d5bdaf", 'white', "#84a98c"))(100),
         clustering_method = "ward.D")

###
interaction_nodes1 <- interaction_df %>%
  filter(signal == receiver) %>%
  distinct(receiver, signal, sum_signal, sum_receiver, sum_signal_size, sum_receiver_size) %>%
  mutate(sum_total = sum_receiver_size + sum_signal_size) %>%
  mutate(class = ifelse(sum_signal <= 0, "repressor", "activator")) %>%
  distinct(receiver, sum_total, class)

### Add only-signal
interaction_nodes2 <- interaction_df %>%
  filter(!signal %in% unique(interaction_nodes1$receiver)) %>%
  dplyr::select("receiver" = signal, "sum_total" = sum_signal_size, sum_signal) %>%
  mutate(class = ifelse(sum_signal <= 0, "repressor", "activator")) %>%
  dplyr::select(-sum_signal) %>%
  distinct()

### Add only-receiver
interaction_nodes3 <- interaction_df %>%
  filter(!receiver %in% unique(interaction_nodes1$receiver)) %>%
  dplyr::select(receiver, "sum_total" = sum_receiver_size, sum_receiver) %>%
  mutate(class = ifelse(sum_receiver <= 0, "repressor", "activator")) %>%
  dplyr::select(-sum_receiver) %>%
  distinct()

interaction_nodes <- rbind(interaction_nodes1, interaction_nodes2, interaction_nodes3)

interaction_edges <- interaction_df %>%
  left_join(target_tf %>% dplyr::select("signal" = condition, target)) %>%
  filter(signal != receiver) %>%
  distinct(receiver, signal, TF_dif_binary, reporter_dif_minP) %>%
  mutate(weight = 0) %>%
  mutate(weight = ifelse(TF_dif_binary == -1, 1, weight)) %>%
  mutate(weight = ifelse(TF_dif_binary == 1, 2, weight)) %>%
  mutate(class = ifelse(weight == 1, "repressor", "activator")) %>%
  dplyr::select(signal, receiver, weight, class, reporter_dif_minP) %>%
  filter(weight != 0) %>%
  mutate(reporter_dif_minP = abs(reporter_dif_minP)) #%>%
## Joining, by = "signal"
  #mutate(signal = gsub("POU5F1::SOX2", "POU5F1", signal)) %>%
  #mutate(receiver = gsub("POU5F1::SOX2", "POU5F1", receiver))

interaction_edges <- interaction_edges #%>%
  #filter(!(signal == "POU5F1::SOX2" & receiver == "SOX2")) %>% ## Remove interactions caused by off-target
  #filter(!(signal == "SOX2" & receiver == "POU5F1::SOX2")) 

interaction_nodes <- interaction_nodes %>%
  #mutate(receiver = gsub("POU5F1::SOX2", "POU5F1", receiver)) %>%
  filter(receiver %in% c(unique(interaction_edges$receiver),unique(interaction_edges$signal)))

interaction_nodes <- interaction_nodes %>%
  mutate(color = ifelse(class == "repressor", "grey90", "#9AC1AE"))

interaction_edges <- interaction_edges %>%
  mutate(color = ifelse(class == "repressor", "grey90", "#9AC1AE"))

routes_igraph <- graph_from_data_frame(d = interaction_edges, vertices = interaction_nodes, directed = TRUE)

visIgraph(routes_igraph) %>%
    visNodes(size = 25, shape = "circle") %>%
    visOptions(highlightNearest = list(enabled = TRUE, hover = TRUE),
               nodesIdSelection = TRUE) %>%
    visInteraction(keyboard = TRUE)
ggraph(routes_igraph, layout = "stress") +
  geom_node_point(aes(size = sum_total, color = class)) +
  geom_edge_fan(aes(color = class, width = reporter_dif_minP)) +
  scale_edge_width(range = c(0.2, 2)) +
  scale_radius(limits = c(1, NA), range = c(1, 6)) +
  geom_node_text(aes(label = name), repel = TRUE) +
  scale_edge_color_manual(values = c("activator" = colors_diverse[2], "repressor" = colors_diverse[1])) +
  scale_color_manual(values = c("activator" = colors_diverse[2], "repressor" = colors_diverse[1])) +
  theme_graph()

ggraph(routes_igraph, layout = "stress") +
  geom_node_point(aes(size = sum_total, color = class)) +
  geom_edge_fan(aes(color = class, width = reporter_dif_minP)) +
  scale_edge_width(range = c(0.2, 2)) +
  scale_radius(limits = c(1, NA), range = c(1, 6)) +
  geom_node_text(aes(label = name), repel = TRUE) +
  scale_edge_color_manual(values = c("activator" = colors_diverse[2], "repressor" = colors_diverse[1])) +
  scale_color_manual(values = c("activator" = colors_diverse[2], "repressor" = colors_diverse[1])) +
  theme_graph()

ggraph(routes_igraph, layout = "stress") +
  geom_node_point(aes(size = sum_total, color = class)) +
  geom_edge_fan(aes(color = class, width = reporter_dif_minP)) +
  scale_edge_width(range = c(0.2, 2)) +
  scale_radius(limits = c(1, NA), range = c(1, 6)) +
  geom_node_text(aes(label = name), repel = TRUE) +
  scale_edge_color_manual(values = c("activator" = colors_diverse[2], "repressor" = colors_diverse[1])) +
  scale_color_manual(values = c("activator" = colors_diverse[2], "repressor" = colors_diverse[1])) +
  theme_graph()

Conclusion: Interesting networks that put POU5F1 at the center.


Compare changes in reporter activity upon SOX2 degradation to changes in TF abundance

# Load degron TT-seq data
sox2_degr <- readRDS("/DATA/usr/m.trauernicht/projects/SuRE-TF/data/DE_SOX2_degron.rds")

# extract genes
genes_sox2 <- as.data.frame(sox2_degr$rowRanges@elementMetadata)

# Filter for only genes
genes_sox2_filt <- genes_sox2 %>%
  filter(anno == "Single Gene") %>%
  dplyr::select(feat_id, "gene" = symbol) %>%
  mutate(gene = toupper(gene))

# differential expression at 6H
sox2_degr_6 <- as.data.frame(sox2_degr$`6h`) %>%
  rownames_to_column("feat_id") %>%
  left_join(genes_sox2_filt) %>%
  filter(!is.na(gene)) %>%
  distinct(log2FoldChange, gene) %>%
  mutate(gene = ifelse(gene == "NR4A2", "NR4A2::RXRA", gene)) %>%
  mutate(gene = ifelse(gene == "AHR", "AHR::ARNT", gene)) %>%
  mutate(gene = ifelse(gene == "FOS", "FOS::JUN", gene)) %>%
  mutate(gene = ifelse(gene == "NFE2", "MAF::NFE2", gene)) %>%
  mutate(gene = ifelse(gene == "NR1H4", "NR1H4::RXRA", gene)) %>%
  mutate(gene = ifelse(gene == "TRP53", "TP53", gene))
  
# Load TF reporter data
cDNA_df2_mES_sox2 <- cDNA_df2_mES %>%
  filter(condition == "mES_Sox2_AID") %>%
  dplyr::select("gene" = tf, reporter_dif_minP) %>%
  left_join(sox2_degr_6)
  
# Plot correlation
ggplot(cDNA_df2_mES_sox2, aes(y = log2FoldChange, x = reporter_dif_minP)) +
  geom_abline(lty = 2) +
  geom_hline(yintercept = 0, lty = 1) +
  geom_vline(xintercept = 0, lty = 1) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_pubr()

Conclusion: No correlation between changes in reporter activity and changes in TF abundance upon SOX2 degradation.


Session Info

paste("Run time: ",format(Sys.time()-StartTime))
## [1] "Run time:  1.767837 mins"
getwd()
## [1] "/DATA/usr/m.trauernicht/projects/SuRE-TF"
date()
## [1] "Wed May 15 16:54:57 2024"
sessionInfo()
## R version 4.0.5 (2021-03-31)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
##  [1] parallel  stats4    grid      stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] GenomicRanges_1.42.0 GenomeInfoDb_1.26.7  IRanges_2.24.1      
##  [4] S4Vectors_0.28.1     BiocGenerics_0.36.1  ggrepel_0.9.1       
##  [7] umap_0.2.8.0         AMR_2.0.0            plotly_4.10.0       
## [10] ggiraph_0.8.6        ggh4x_0.2.3          biomaRt_2.46.3      
## [13] IHW_1.18.0           ggbiplot_0.55        scales_1.2.0        
## [16] readr_2.1.2          ggrastr_1.0.1        randomForest_4.6-14 
## [19] stringr_1.4.0        tidyr_1.2.0          pROC_1.18.0         
## [22] gridExtra_2.3        cowplot_1.1.1        ggraph_2.0.6        
## [25] igraph_1.3.5         plyr_1.8.7           viridis_0.6.2       
## [28] viridisLite_0.4.0    ggforce_0.3.3        ggbeeswarm_0.6.0    
## [31] visNetwork_2.1.0     ggpubr_0.4.0         pheatmap_1.0.12     
## [34] tibble_3.1.6         maditr_0.8.3         dplyr_1.0.8         
## [37] ggplot2_3.4.0        RColorBrewer_1.1-3  
## 
## loaded via a namespace (and not attached):
##   [1] uuid_1.1-0             backports_1.4.1        BiocFileCache_1.14.0  
##   [4] systemfonts_1.0.4      lazyeval_0.2.2         splines_4.0.5         
##   [7] lpsymphony_1.18.0      digest_0.6.29          htmltools_0.5.2       
##  [10] fansi_1.0.3            magrittr_2.0.3         memoise_2.0.1         
##  [13] tzdb_0.3.0             graphlayouts_0.8.0     vroom_1.5.7           
##  [16] askpass_1.1            prettyunits_1.1.1      colorspace_2.0-3      
##  [19] blob_1.2.3             rappdirs_0.3.3         xfun_0.30             
##  [22] RCurl_1.98-1.6         crayon_1.5.1           jsonlite_1.8.0        
##  [25] glue_1.6.2             polyclip_1.10-0        gtable_0.3.0          
##  [28] zlibbioc_1.36.0        XVector_0.30.0         car_3.0-12            
##  [31] abind_1.4-5            DBI_1.1.2              rstatix_0.7.0         
##  [34] Rcpp_1.0.8.3           progress_1.2.2         reticulate_1.24       
##  [37] bit_4.0.4              htmlwidgets_1.5.4      httr_1.4.2            
##  [40] ellipsis_0.3.2         pkgconfig_2.0.3        XML_3.99-0.9          
##  [43] farver_2.1.0           sass_0.4.1             dbplyr_2.1.1          
##  [46] utf8_1.2.2             tidyselect_1.1.2       labeling_0.4.2        
##  [49] rlang_1.0.6            AnnotationDbi_1.52.0   munsell_0.5.0         
##  [52] tools_4.0.5            cachem_1.0.6           cli_3.4.1             
##  [55] generics_0.1.2         RSQLite_2.2.12         broom_0.8.0           
##  [58] fdrtool_1.2.17         evaluate_0.15          fastmap_1.1.0         
##  [61] yaml_2.3.5             knitr_1.38             bit64_4.0.5           
##  [64] tidygraph_1.2.1        purrr_0.3.4            nlme_3.1-152          
##  [67] slam_0.1-50            xml2_1.3.3             compiler_4.0.5        
##  [70] rstudioapi_0.13        beeswarm_0.4.0         curl_4.3.2            
##  [73] png_0.1-7              ggsignif_0.6.3         tweenr_1.0.2          
##  [76] bslib_0.3.1            stringi_1.7.6          highr_0.9             
##  [79] RSpectra_0.16-1        lattice_0.20-41        Matrix_1.5-1          
##  [82] vctrs_0.5.1            pillar_1.7.0           lifecycle_1.0.3       
##  [85] jquerylib_0.1.4        bitops_1.0-7           data.table_1.14.2     
##  [88] R6_2.5.1               vipor_0.4.5            MASS_7.3-53.1         
##  [91] assertthat_0.2.1       openssl_2.0.0          withr_2.5.0           
##  [94] GenomeInfoDbData_1.2.4 mgcv_1.8-34            hms_1.1.1             
##  [97] rmarkdown_2.13         carData_3.0-5          Cairo_1.5-15          
## [100] Biobase_2.50.0